From 0a49f9ce77d7fff83e40cdb456fc30891b68ae45 Mon Sep 17 00:00:00 2001
From: Mark Berrill <berrillma@ornl.gov>
Date: Mon, 18 Mar 2019 09:42:44 -0400
Subject: [PATCH] Updating threadpool / StackTrace

---
 CMakeLists.txt                       |    6 +-
 IO/Mesh.cpp                          |    2 +-
 IO/Mesh.h                            |    3 +-
 IO/MeshDatabase.h                    |    2 +-
 IO/Reader.h                          |    2 +-
 IO/Writer.cpp                        |    2 +-
 StackTrace/ErrorHandlers.h           |   42 +
 StackTrace/Readme.txt                |    4 +
 StackTrace/StackTrace.cpp            | 2517 ++++++++++++++++++++++++++
 {common => StackTrace}/StackTrace.h  |  157 +-
 StackTrace/Utilities.cpp             |  296 +++
 StackTrace/Utilities.h               |   99 +
 StackTrace/string_view.h             |  193 ++
 analysis/Minkowski.cpp               |    6 +-
 analysis/Minkowski.h                 |    2 +-
 analysis/TwoPhase.cpp                |    7 +-
 analysis/TwoPhase.h                  |    7 +-
 cmake/SharedPtr.cmake                |  170 --
 common/Array.hpp                     |    5 +-
 common/MPI_Helpers.cpp               |   10 +-
 common/StackTrace.cpp                | 1876 -------------------
 common/Utilities.cpp                 |  295 ---
 common/Utilities.h                   |   85 +-
 tests/CMakeLists.txt                 |    1 -
 tests/TestWriter.cpp                 |    2 +-
 tests/testUtilities.cpp              |  145 --
 threadpool/Readme.txt                |    2 +
 threadpool/atomic_helpers.cpp        |   38 +
 threadpool/atomic_helpers.h          |    8 +-
 threadpool/atomic_list.h             |   54 +-
 threadpool/atomic_list.hpp           |   58 +-
 threadpool/test/CMakeLists.txt       |   16 -
 threadpool/test/test_atomic.cpp      |  154 --
 threadpool/test/test_atomic_list.cpp |  221 ---
 threadpool/test/test_thread_pool.cpp |  967 ----------
 threadpool/thread_pool.cpp           |  470 +++--
 threadpool/thread_pool.h             |  226 ++-
 threadpool/thread_pool.hpp           |  185 +-
 38 files changed, 3849 insertions(+), 4486 deletions(-)
 create mode 100644 StackTrace/ErrorHandlers.h
 create mode 100644 StackTrace/Readme.txt
 create mode 100644 StackTrace/StackTrace.cpp
 rename {common => StackTrace}/StackTrace.h (59%)
 create mode 100644 StackTrace/Utilities.cpp
 create mode 100644 StackTrace/Utilities.h
 create mode 100644 StackTrace/string_view.h
 delete mode 100644 cmake/SharedPtr.cmake
 delete mode 100644 common/StackTrace.cpp
 delete mode 100644 tests/testUtilities.cpp
 create mode 100644 threadpool/Readme.txt
 delete mode 100644 threadpool/test/CMakeLists.txt
 delete mode 100644 threadpool/test/test_atomic.cpp
 delete mode 100644 threadpool/test/test_atomic_list.cpp
 delete mode 100644 threadpool/test/test_thread_pool.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2f055989..acc2c2dc 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -112,7 +112,7 @@ ENDIF()
 ADD_CUSTOM_TARGET( build-test )
 ADD_CUSTOM_TARGET( build-examples )
 ADD_CUSTOM_TARGET( check COMMAND  make test  )
-ADD_DISTCLEAN( analysis null_timer tests liblbpm-wia.* cpu gpu example common IO threadpool )
+ADD_DISTCLEAN( analysis null_timer tests liblbpm-wia.* cpu gpu example common IO threadpool StackTrace )
 
 
 # Check for CUDA
@@ -133,8 +133,6 @@ IF ( NOT ONLY_BUILD_DOCS )
     CONFIGURE_LBPM()
     CONFIGURE_TIMER( 0 "${${PROJ}_INSTALL_DIR}/null_timer" )
     CONFIGURE_LINE_COVERAGE()
-    INCLUDE( "${CMAKE_CURRENT_SOURCE_DIR}/cmake/SharedPtr.cmake" )
-    CONFIGURE_SHARED_PTR( "${${PROJ}_INSTALL_DIR}/include" "std" )
     # Set the external library link list
     SET( EXTERNAL_LIBS ${EXTERNAL_LIBS} ${TIMER_LIBS} )
 ENDIF()
@@ -156,6 +154,7 @@ IF ( NOT ONLY_BUILD_DOCS )
     ADD_PACKAGE_SUBDIRECTORY( analysis )
     ADD_PACKAGE_SUBDIRECTORY( IO )
     ADD_PACKAGE_SUBDIRECTORY( threadpool )
+    ADD_PACKAGE_SUBDIRECTORY( StackTrace )
     ADD_PACKAGE_SUBDIRECTORY( models )
     IF ( USE_CUDA )
         ADD_PACKAGE_SUBDIRECTORY( gpu )
@@ -164,7 +163,6 @@ IF ( NOT ONLY_BUILD_DOCS )
     ENDIF()
     INSTALL_LBPM_TARGET( lbpm-wia-library  )
     ADD_SUBDIRECTORY( tests )
-    ADD_SUBDIRECTORY( threadpool/test )
     ADD_SUBDIRECTORY( example )
     #ADD_SUBDIRECTORY( workflows )
     INSTALL_PROJ_LIB()
diff --git a/IO/Mesh.cpp b/IO/Mesh.cpp
index 742dac85..eb712296 100644
--- a/IO/Mesh.cpp
+++ b/IO/Mesh.cpp
@@ -1,8 +1,8 @@
 #include "Mesh.h"
 #include "common/Utilities.h"
-#include "shared_ptr.h"
 
 #include <limits>
+#include <memory>
 #include <stdint.h>
 
 namespace IO {
diff --git a/IO/Mesh.h b/IO/Mesh.h
index 604dddfd..b204675a 100644
--- a/IO/Mesh.h
+++ b/IO/Mesh.h
@@ -2,14 +2,13 @@
 #define MESH_INC
 
 #include <iostream>
+#include <memory>
 #include <string.h>
 #include <vector>
 
 #include "common/Array.h"
 #include "common/Communication.h"
 #include "analysis/PointList.h"
-#include "shared_ptr.h"
-
 
 
 namespace IO {
diff --git a/IO/MeshDatabase.h b/IO/MeshDatabase.h
index ad696260..9f544925 100644
--- a/IO/MeshDatabase.h
+++ b/IO/MeshDatabase.h
@@ -3,9 +3,9 @@
 
 #include "IO/Mesh.h" 
 #include "common/MPI_Helpers.h"
-#include "shared_ptr.h"
 
 #include <iostream>
+#include <memory>
 #include <string.h>
 #include <vector>
 #include <map>
diff --git a/IO/Reader.h b/IO/Reader.h
index ce8dba22..4230ff8f 100644
--- a/IO/Reader.h
+++ b/IO/Reader.h
@@ -2,12 +2,12 @@
 #define READER_INC
 
 #include <iostream>
+#include <memory>
 #include <string.h>
 #include <vector>
 
 #include "IO/Mesh.h"
 #include "IO/MeshDatabase.h"
-#include "shared_ptr.h"
 
 
 namespace IO {
diff --git a/IO/Writer.cpp b/IO/Writer.cpp
index bb522cf6..6581ad42 100644
--- a/IO/Writer.cpp
+++ b/IO/Writer.cpp
@@ -4,12 +4,12 @@
 #include "IO/silo.h"
 #include "common/MPI_Helpers.h"
 #include "common/Utilities.h"
-#include "shared_ptr.h"
 
 #include <sys/stat.h>
 #include <algorithm>
 #include <vector>
 #include <set>
+#include <memory>
 
 
 
diff --git a/StackTrace/ErrorHandlers.h b/StackTrace/ErrorHandlers.h
new file mode 100644
index 00000000..12b8d7de
--- /dev/null
+++ b/StackTrace/ErrorHandlers.h
@@ -0,0 +1,42 @@
+#ifndef included_StackTraceErrorHandlers
+#define included_StackTraceErrorHandlers
+
+
+#include "StackTrace/StackTrace.h"
+
+#include <functional>
+
+#include "mpi.h"
+
+
+namespace StackTrace
+{
+
+
+    /*!
+     * Set the error handler
+     * @param[in] abort     Function to terminate the program: abort(msg,type)
+     */
+    void setErrorHandler( std::function<void( const StackTrace::abort_error& )> abort );
+
+    //! Clear the error handler
+    void clearErrorHandler();
+
+
+    //! Set an error handler for MPI
+    void setMPIErrorHandler( MPI_Comm comm );
+
+    //! Clear an error handler for MPI
+    void clearMPIErrorHandler( MPI_Comm comm );
+
+
+    //! Initialize globalCallStack functionality
+    void globalCallStackInitialize( MPI_Comm comm );
+
+    //! Clean up globalCallStack functionality
+    void globalCallStackFinalize();
+
+
+} // namespace StackTrace
+
+#endif
diff --git a/StackTrace/Readme.txt b/StackTrace/Readme.txt
new file mode 100644
index 00000000..264fed62
--- /dev/null
+++ b/StackTrace/Readme.txt
@@ -0,0 +1,4 @@
+This directory contains external code released with permission under the license of this project.
+
+Original code and license are available at:
+https://bitbucket.org/mberrill/StackTrace
diff --git a/StackTrace/StackTrace.cpp b/StackTrace/StackTrace.cpp
new file mode 100644
index 00000000..e9292990
--- /dev/null
+++ b/StackTrace/StackTrace.cpp
@@ -0,0 +1,2517 @@
+#include "StackTrace/StackTrace.h"
+#include "StackTrace/ErrorHandlers.h"
+#include "StackTrace/Utilities.h"
+
+// Replace with std::string_view when we switch to c++17
+#include "StackTrace/string_view.h"
+
+#include <algorithm>
+#include <atomic>
+#include <csignal>
+#include <cstring>
+#include <iostream>
+#include <map>
+#include <memory>
+#include <mutex>
+#include <random>
+#include <set>
+#include <sstream>
+#include <stdexcept>
+#include <thread>
+
+
+#define perr std::cerr
+
+using StackTrace::string_view;
+
+// Detect the OS
+// clang-format off
+#if defined( WIN32 ) || defined( _WIN32 ) || defined( WIN64 ) || defined( _WIN64 ) || defined( _MSC_VER )
+    #define USE_WINDOWS
+    #define NOMINMAX
+#elif defined( __APPLE__ )
+    #define USE_MAC
+    #define USE_NM
+#elif defined( __linux ) || defined( __linux__ ) || defined( __unix ) || defined( __posix )
+    #define USE_LINUX
+    #define USE_NM
+#else
+    #error Unknown OS
+#endif
+// clang-format on
+
+
+// Include system dependent headers
+// clang-format off
+// Detect the OS and include system dependent headers
+#ifdef USE_WINDOWS
+    #include <windows.h>
+    #include <dbghelp.h>
+    #include <DbgHelp.h>
+    #include <TlHelp32.h>
+    #include <Psapi.h>
+    #include <process.h>
+    #include <stdio.h>
+    #include <tchar.h>
+    #pragma comment( lib, "version.lib" ) // for "VerQueryValue"
+#else
+    #include <dlfcn.h>
+    #include <execinfo.h>
+    #include <sched.h>
+    #include <sys/time.h>
+    #include <ctime>
+    #include <unistd.h>
+    #include <sys/syscall.h>
+#endif
+#ifdef USE_MAC
+    #include <mach-o/dyld.h>
+    #include <mach/mach.h>
+    #include <sys/sysctl.h>
+    #include <sys/types.h>
+    #define SIGRTMIN SIGUSR1
+    #define SIGRTMAX SIGUSR2
+#endif
+// clang-format on
+
+
+#ifdef __GNUC__
+#define USE_ABI
+#include <cxxabi.h>
+#endif
+
+
+#ifndef NULL_USE
+#define NULL_USE( variable )                       \
+    do {                                           \
+        if ( 0 ) {                                 \
+            auto static temp = (char *) &variable; \
+            temp++;                                \
+        }                                          \
+    } while ( 0 )
+#endif
+
+
+// Mutex for StackTrace operations that need blocking
+static std::mutex StackTrace_mutex;
+
+
+// Helper thread
+static std::shared_ptr<std::thread> globalMonitorThread;
+
+
+// Replace the len characters at str[pos] with r, in place; str holds N characters
+static constexpr size_t replace(
+    char *str, size_t N, size_t pos, size_t len, const string_view &r ) noexcept
+{
+    size_t Nr = r.size();
+    size_t k  = pos; // Write position; its final value is the new length that is returned
+    for ( size_t i = 0; i < Nr && k < N; i++, k++ )
+        str[k] = r[i]; // Replacement text, cut off at N
+    // Move the text that followed the replaced span; assumes r.size() <= len (no growth)
+    for ( size_t i = pos + len; i < N && k < N; i++, k++ )
+        str[k] = str[i];
+    for ( size_t m = k; m < N; m++ )
+        str[m] = 0; // Clear every leftover character, not only str[k]
+    return k;
+}
+template<std::size_t N>
+static constexpr size_t replace(
+    std::array<char, N> &str, size_t pos, size_t len, const string_view &r ) noexcept
+{
+    return replace( str.data(), N, pos, len, r ); // Forward to the raw-buffer overload
+}
+static constexpr void strrep(
+    char *str, size_t &N, const string_view &s, const string_view &r ) noexcept
+{
+    size_t Ns  = s.size();
+    size_t pos = string_view( str, N ).find( s ); // Look for s inside str[0,N)
+    while ( pos != std::string::npos ) { // NOTE(review): never exits if r itself contains s
+        N   = replace( str, N, pos, Ns, r ); // N becomes the length after the replacement
+        pos = string_view( str, N ).find( s ); // Search again in the updated text
+    }
+}
+
+static void cleanupFunctionName( char * );
+
+
+// Return a pointer to the character after the last '/' or '\' in filename
+static constexpr const char *stripPath( const char *filename ) noexcept
+{
+    const char *base = filename;
+    for ( const char *p = filename; *p != 0; ++p ) {
+        const bool isSeparator = ( *p == '/' ) || ( *p == '\\' );
+        if ( isSeparator )
+            base = p + 1;
+    }
+    return base;
+}
+
+
+// Functions to hash strings
+constexpr uint32_t hashString( const char *s )
+{
+    // Multiply-by-33 and XOR over each character, starting from the seed 5381
+    uint32_t value = 5381;
+    for ( uint32_t ch = *s++; ch != 0; ch = *s++ )
+        value = ( value * 33u ) ^ ch;
+    return value;
+}
+template<std::size_t N1, std::size_t N2>
+static constexpr uint64_t objHash(
+    const std::array<char, N1> &obj, const std::array<char, N2> &objPath )
+{
+    // High word: hash of the object name; low word: name hash XOR path hash
+    const uint64_t nameHash = hashString( obj.data() );
+    const uint64_t pathHash = hashString( objPath.data() );
+    return ( nameHash << 32 ) + ( nameHash ^ pathHash );
+}
+
+
+//! Assign a string to a std::array
+template<std::size_t N2>
+static constexpr void copy( const char *in, std::array<char, N2> &out ) noexcept
+{
+    // Text that fits (with its terminator) is copied whole; anything longer is
+    // cut to N2-4 characters followed by "..."
+    const size_t length = strlen( in );
+    const bool fits     = length < N2;
+    out.fill( 0 );
+    memcpy( out.data(), in, fits ? length : N2 - 4 );
+    if ( !fits )
+        out[N2 - 4] = out[N2 - 3] = out[N2 - 2] = '.';
+}
+template<std::size_t N1, std::size_t N2>
+static constexpr void copy( const std::array<char, N1> &in, std::array<char, N2> &out ) noexcept
+{
+    out.fill( 0 ); // Clear the destination first so the result is always null-terminated
+    if ( N1 < N2 ) { // Decided by the array capacities, not by the length of the text
+        memcpy( out.data(), in.data(), N1 ); // Source array fits: copy all of it
+    } else {
+        memcpy( out.data(), in.data(), N2 - 4 ); // Keep only the first N2-4 characters
+        out[N2 - 4] = out[N2 - 3] = out[N2 - 2] = '.'; // End with "..."; out[N2-1] stays 0
+    }
+}
+template<std::size_t N2, std::size_t N3>
+static constexpr void copy(
+    const char *in, std::array<char, N2> &out, std::array<char, N3> &outPath ) noexcept
+{
+    auto ptr = stripPath( in ); // Start of the file-name part of in
+    copy( ptr, out );           // out receives the name without its directory
+    outPath.fill( 0 );          // outPath stays empty when in has no directory part
+    if ( ptr != in ) {
+        size_t N = ptr - in - 1; // Length of the directory, excluding the final separator
+        if ( N < N3 ) {
+            memcpy( outPath.data(), in, N );
+        } else {
+            memcpy( outPath.data(), in, N3 - 4 ); // Too long: truncate and end with "..."
+            outPath[N3 - 4] = outPath[N3 - 3] = outPath[N3 - 2] = '.';
+        }
+    }
+}
+
+
+// Inline function to subtract two addresses returning the absolute difference
+static inline void *subtractAddress( void *a, void *b ) noexcept
+{
+    const int64_t delta = reinterpret_cast<int64_t>( a ) - reinterpret_cast<int64_t>( b );
+    return reinterpret_cast<void *>( std::abs( delta ) );
+}
+
+
+#ifdef USE_WINDOWS
+static BOOL __stdcall readProcMem( HANDLE hProcess, DWORD64 qwBaseAddress, PVOID lpBuffer,
+    DWORD nSize, LPDWORD lpNumberOfBytesRead )
+{
+    SIZE_T st; // ReadProcessMemory reports the byte count through a SIZE_T
+    BOOL bRet = ReadProcessMemory( hProcess, (LPVOID) qwBaseAddress, lpBuffer, nSize, &st );
+    *lpNumberOfBytesRead = (DWORD) st; // Narrowed to the DWORD supplied by the caller
+    return bRet;
+}
+static inline std::string getCurrentDirectory()
+{
+    char temp[1024] = { 0 }; // Zero-filled, so temp is a valid C-string even if nothing is written
+    GetCurrentDirectoryA( sizeof( temp ), temp );
+    return temp;
+}
+namespace StackTrace {
+BOOL GetModuleListTH32( HANDLE hProcess, DWORD pid );
+BOOL GetModuleListPSAPI( HANDLE hProcess );
+DWORD LoadModule( HANDLE hProcess, LPCSTR img, LPCSTR mod, DWORD64 baseAddr, DWORD size );
+void LoadModules();
+}; // namespace StackTrace
+#endif
+
+
+/****************************************************************************
+ *  Class to replace a std::vector with a fixed capacity                     *
+ ****************************************************************************/
+template<class TYPE, std::size_t CAPACITY>
+class staticVector final
+{
+public:
+    staticVector() : d_size( 0 ) {}
+    size_t size() const { return d_size; }
+    bool empty() const { return d_size == 0; }
+    void push_back( const TYPE &v )
+    {
+        if ( d_size < CAPACITY ) // A full container silently ignores the new element
+            d_data[d_size++] = v;
+    }
+    TYPE &operator[]( size_t i ) { return d_data[i]; } // No bounds checking
+    TYPE *begin() { return d_data; }
+    TYPE *end() { return d_data + d_size; }
+    TYPE &back() { return d_data[d_size - 1]; } // Must not be called on an empty container
+    TYPE *data() { return d_size == 0 ? nullptr : d_data; }
+    void pop_back() { d_size = std::max<size_t>( d_size, 1 ) - 1; } // No-op when empty
+    const TYPE *begin() const { return d_data; }
+    const TYPE *end() const { return d_data + d_size; }
+    const TYPE &back() const { return d_data[d_size - 1]; }
+    void clear() { d_size = 0; } // Elements are not destroyed, only the count is reset
+    void resize( size_t N, TYPE x = TYPE() )
+    {
+        if ( N > CAPACITY )
+            throw std::logic_error( "Invalid size" );
+        for ( size_t i = d_size; i < N; i++ ) // Only newly exposed slots are set to x
+            d_data[i] = x;
+        d_size = N;
+    }
+    void erase( const TYPE &x ) // Removes every element equal to x, keeping the order of the rest
+    {
+        size_t N = 0; // Number of elements kept so far
+        for ( size_t i = 0; i < d_size; i++ ) {
+            if ( d_data[i] != x )
+                d_data[N++] = d_data[i];
+        }
+        d_size = N;
+    }
+    void insert( const TYPE &x ) // Adds x only if it is not already present, then sorts
+    {
+        if ( std::find( begin(), end(), x ) == end() ) {
+            push_back( x );
+            std::sort( begin(), end() );
+        }
+    }
+
+private:
+    size_t d_size;         // Number of elements currently in use
+    TYPE d_data[CAPACITY]; // Fixed-size inline storage
+};
+
+
+/****************************************************************************
+ *  Utility to temporarily clear a signal in a thread-safe manner            *
+ *  If multiple threads attempt to clear a signal, then it will be cleared   *
+ *  until all threads are finished                                           *
+ ****************************************************************************/
+typedef void ( *handle_type )( int );
+static std::atomic_int reset_signal_count[128];
+static handle_type reset_signal_handler[128] = { nullptr };
+static bool initialize_reset_signal_count()
+{
+    for ( auto &count : reset_signal_count ) // Zero the counter of every signal slot
+        count.store( 0 );
+    return true;
+}
+static bool reset_signal_vars_initialize = initialize_reset_signal_count();
+static void clearSignal( int sig )
+{
+    NULL_USE( reset_signal_vars_initialize ); // References the initializer flag without using it
+    if ( reset_signal_count[sig].fetch_add( 1 ) == 0 ) // Count was 0: first active caller
+        reset_signal_handler[sig] = signal( sig, SIG_IGN ); // Ignore sig, keep the old handler
+}
+static void resetSignal( int sig )
+{
+    if ( reset_signal_count[sig].fetch_add( -1 ) == 1 ) // Count was 1: last active caller
+        signal( sig, reset_signal_handler[sig] );       // Put back the handler that was saved
+}
+
+
+/****************************************************************************
+ *  Utility to call system command and return output                         *
+ ****************************************************************************/
+#ifdef USE_WINDOWS
+#define popen _popen
+#define pclose _pclose
+#endif
+template<class FUNCTION>
+static inline int exec3( const char *cmd, FUNCTION &fun )
+{
+    clearSignal( SIGCHLD ); // Ignore SIGCHLD while the child process runs
+    auto pipe = popen( cmd, "r" );
+    if ( pipe == nullptr ) {
+        resetSignal( SIGCHLD ); // Undo clearSignal so SIGCHLD is not left ignored
+        return -1;
+    }
+    // Pass each line of output to fun; fgets returns null on EOF and on a read error
+    char buffer[0x2000];
+    while ( fgets( buffer, sizeof( buffer ), pipe ) != nullptr ) {
+        if ( buffer[0] != 0 )
+            fun( buffer );
+    }
+    auto status = pclose( pipe );
+    int code    = WEXITSTATUS( status );
+    std::this_thread::yield(); // Allow any signals to process
+    resetSignal( SIGCHLD );    // Restore the previous SIGCHLD handling
+    return code;
+}
+template<std::size_t blocKSize>
+static void exec2( const char *cmd, staticVector<std::array<char, 1024>, blocKSize> &out )
+{
+    out.clear();
+    auto fun = [&out]( const char *line ) {
+        size_t N = std::min<size_t>( strlen( line ), 1023 ); // Truncate; keep a terminator
+        size_t k = out.size();
+        out.resize( k + 1 ); // Throws std::logic_error once blocKSize lines are stored
+        out[k].fill( 0 );
+        memcpy( out[k].data(), line, N );
+        if ( N > 0 && out[k][N - 1] == '\n' ) // Drop the trailing newline, if any
+            out[k][N - 1] = 0;
+    };
+    exec3( cmd, fun ); // Runs cmd and stores one array entry per output line
+}
+std::string StackTrace::exec( const string_view &cmd, int &code )
+{
+    std::string result; // Accumulates the complete output of the command
+    auto fun = [&result]( const char *line ) { result += line; };
+    code = exec3( std::string( cmd.data(), cmd.size() ).c_str(), fun ); // cmd may lack a '\0'
+    return result;
+}
+
+
+/****************************************************************************
+ *  stack_info                                                               *
+ ****************************************************************************/
+static_assert( sizeof( StackTrace::stack_info ) <= 512, "Unexpected size for stack_info" );
+StackTrace::stack_info::stack_info() { clear(); } // Start with every field zeroed
+void StackTrace::stack_info::clear()
+{
+    // Text fields become empty (all-zero) strings, scalar fields become 0 / nullptr
+    function.fill( 0 );
+    filename.fill( 0 );
+    filenamePath.fill( 0 );
+    object.fill( 0 );
+    objectPath.fill( 0 );
+    address = address2 = nullptr;
+    line               = 0;
+}
+bool StackTrace::stack_info::operator==( const StackTrace::stack_info &rhs ) const
+{
+    // Two frames match when their addresses are equal, or when both
+    // address2 and the object name are equal
+    const bool sameAddress = address == rhs.address;
+    const bool sameOffset  = address2 == rhs.address2 && object == rhs.object;
+    return sameAddress || sameOffset;
+}
+bool StackTrace::stack_info::operator!=( const StackTrace::stack_info &rhs ) const
+{
+    return !( *this == rhs ); // Exact negation of operator==
+}
+int StackTrace::stack_info::getAddressWidth() const
+{
+    const auto value = reinterpret_cast<unsigned long long int>( address );
+    int digits       = 4; // Hex digits needed for the address: 4, 8, 12 or 16
+    unsigned long long int limit = 0xFFFF;
+    while ( digits < 16 && value > limit ) {
+        limit = ( limit << 16 ) | 0xFFFF;
+        digits += 4;
+    }
+    return digits;
+}
+std::string StackTrace::stack_info::print( int w1, int w2, int w3 ) const
+{
+    char text[32 + sizeof( stack_info )]; // Scratch buffer that print2 formats into
+    print2( text, w1, w2, w3 );
+    return text;
+}
+void StackTrace::stack_info::print(
+    std::ostream &out, const std::vector<stack_info> &stack, const StackTrace::string_view &prefix )
+{
+    char text[32 + sizeof( stack_info )]; // Reused for every frame
+    for ( size_t i = 0; i < stack.size(); i++ ) {
+        stack[i].print2( text, 16, 20, 32 ); // Fixed widths: address, object, function
+        out << prefix << text << std::endl;
+    }
+}
+void StackTrace::stack_info::print2( char *out, int w1, int w2, int w3 ) const
+{
+    char tmp1[16], tmp2[16];                 // Hold the printf formats built from the widths
+    sprintf( tmp1, "0x%%0%illx:  ", w1 );    // e.g. w1=16 gives "0x%016llx:  "
+    sprintf( tmp2, "%%%is  %%%is", w2, w3 ); // e.g. w2=20, w3=32 gives "%20s  %32s"
+    size_t pos = 0; // out has no length parameter: the caller must supply a large enough buffer
+    pos += sprintf( &out[pos], tmp1, reinterpret_cast<unsigned long long int>( address ) );
+    pos += sprintf( &out[pos], tmp2, stripPath( object.data() ), function.data() );
+    if ( filename[0] != 0 && line > 0 ) { // File and line known
+        pos += sprintf( &out[pos], "  %s:%u", stripPath( filename.data() ), line );
+    } else if ( filename[0] != 0 ) { // File only
+        pos += sprintf( &out[pos], "  %s", stripPath( filename.data() ) );
+    } else if ( line > 0 ) { // Line only
+        pos += sprintf( &out[pos], " : %u", line );
+    }
+    NULL_USE( pos );
+}
+size_t StackTrace::stack_info::size() const { return sizeof( *this ); } // Bytes used by pack()
+char *StackTrace::stack_info::pack( char *ptr ) const
+{
+    // Write the raw bytes of this object to ptr and return the position just after them
+    return static_cast<char *>( memcpy( ptr, this, sizeof( *this ) ) ) + sizeof( *this );
+}
+const char *StackTrace::stack_info::unpack( const char *ptr )
+{
+    memcpy( this, ptr, size() ); // Overwrite this object with the raw bytes written by pack()
+    return ptr + size();
+}
+
+
+/****************************************************************************
+ *  multi_stack_info                                                         *
+ ****************************************************************************/
+StackTrace::multi_stack_info::multi_stack_info( const std::vector<stack_info> &rhs )
+{
+    *this = rhs; // Reuse the assignment from a list of frames
+}
+StackTrace::multi_stack_info &StackTrace::multi_stack_info::operator=(
+    const std::vector<stack_info> &rhs )
+{
+    clear();
+    if ( !rhs.empty() ) { // rhs[0] is stored in this node, the other frames go through add()
+        N     = 1;
+        stack = rhs[0];
+        if ( rhs.size() > 1 )
+            add( rhs.size() - 1, &rhs[1] );
+    }
+    return *this;
+}
+void StackTrace::multi_stack_info::clear()
+{
+    // Drop all children, blank this node's frame and reset its count
+    children.clear();
+    stack.clear();
+    N = 0;
+}
+template<class FUN>
+void StackTrace::multi_stack_info::print2( int Np, char *prefix, int w[3], bool c, FUN &fun ) const
+{
+    if ( stack.address != 0 ) { // A node without an address prints no line of its own
+        prefix[Np] = 0;         // Terminate the prefix at its current length Np
+        char line[4096];
+        int N2 = sprintf( line, "%s[%i] ", prefix, N ); // "<prefix>[count] "
+        stack.print2( &line[N2], w[0], w[1], w[2] );    // Frame text follows the count
+        fun( line );
+        prefix[Np++] = c ? '|' : ' '; // Children are indented by two more characters
+        prefix[Np++] = ' ';
+    }
+    for ( size_t i = 0; i < children.size(); i++ ) {
+        bool c2           = children.size() > 1 && i < children.size() - 1 && stack.address != 0;
+        const auto &child = children[i]; // c2 is false for the last (or only) child
+        child.print2( Np, prefix, w, c2, fun );
+    }
+}
+std::vector<std::string> StackTrace::multi_stack_info::print( const string_view &prefix ) const
+{
+    std::vector<std::string> text;
+    int w[3] = { getAddressWidth(), getObjectWidth(), getFunctionWidth() }; // Column widths
+    char prefix2[1024]; // NOTE(review): assumes prefix plus 2 chars per nesting level fits
+    memcpy( prefix2, prefix.data(), prefix.size() );
+    auto fun = [&text]( const char *line ) { text.push_back( line ); }; // One entry per line
+    print2( prefix.size(), prefix2, w, false, fun );
+    return text;
+}
+void StackTrace::multi_stack_info::print( std::ostream &out, const string_view &prefix ) const
+{
+    int w[3] = { getAddressWidth(), getObjectWidth(), getFunctionWidth() }; // Column widths
+    char prefix2[1024]; // NOTE(review): assumes prefix plus 2 chars per nesting level fits
+    memcpy( prefix2, prefix.data(), prefix.size() );
+    auto fun = [&out]( const char *line ) { out << line << std::endl; }; // Flushes every line
+    print2( prefix.size(), prefix2, w, false, fun );
+}
+std::string StackTrace::multi_stack_info::printString( const string_view &prefix ) const
+{
+    int w[3] = { getAddressWidth(), getObjectWidth(), getFunctionWidth() }; // Column widths
+    char prefix2[1024]; // NOTE(review): assumes prefix plus 2 chars per nesting level fits
+    memcpy( prefix2, prefix.data(), prefix.size() );
+    std::string out;
+    out.reserve( 4096 ); // Initial capacity; the string still grows as needed
+    auto fun = [&out]( const char *line ) {
+        out += line;
+        out += '\n'; // Every printed line ends with a newline, including the last
+    };
+    print2( prefix.size(), prefix2, w, false, fun );
+    return out;
+}
+int StackTrace::multi_stack_info::getAddressWidth() const
+{
+    int widest = stack.getAddressWidth(); // Start with this node's own frame
+    for ( size_t i = 0; i < children.size(); i++ )
+        widest = std::max( widest, children[i].getAddressWidth() );
+    return widest;
+}
+int StackTrace::multi_stack_info::getObjectWidth() const
+{
+    int w = std::min<int>( stack.object.size() + 1, 20 ); // NOTE(review): size() may be the buffer capacity, not the text length - confirm
+    for ( const auto &child : children )
+        w = std::max( w, child.getObjectWidth() ); // Largest value over all descendants
+    return w;
+}
+int StackTrace::multi_stack_info::getFunctionWidth() const
+{
+    int w = std::min<int>( stack.function.size() + 1, 40 ); // NOTE(review): size() may be the buffer capacity, not the text length - confirm
+    for ( const auto &child : children )
+        w = std::max( w, child.getFunctionWidth() ); // Largest value over all descendants
+    return w;
+}
+void StackTrace::multi_stack_info::add( size_t len, const stack_info *stack )
+{
+    if ( len == 0 )
+        return;
+    const stack_info &frame = stack[len - 1]; // The last entry selects the child at this level
+    auto match = std::find_if( children.begin(), children.end(),
+        [&frame]( const multi_stack_info &child ) { return child.stack == frame; } );
+    if ( match != children.end() ) {
+        match->N++;
+    } else {
+        children.resize( children.size() + 1 );
+        match        = children.end() - 1;
+        match->N     = 1;
+        match->stack = frame;
+    }
+    // Pass the remaining len-1 entries down to the selected child
+    if ( len > 1 )
+        match->add( len - 1, stack );
+}
+void StackTrace::multi_stack_info::add( const multi_stack_info &rhs )
+{
+    N += rhs.N; // Combine the counts of the two nodes
+    for ( const auto &x : rhs.children ) {
+        bool found = false;
+        for ( auto &tmp : children ) {
+            if ( tmp.stack == x.stack ) { // Same frame: merge the two subtrees recursively
+                found = true;
+                tmp.add( x );
+            }
+        }
+        if ( !found )
+            children.push_back( x ); // Unknown frame: copy the whole subtree from rhs
+    }
+}
+size_t StackTrace::multi_stack_info::size() const
+{
+    size_t total = stack.size() + 2 * sizeof( int ); // Own frame plus two int fields
+    for ( size_t i = 0; i < children.size(); i++ )
+        total += children[i].size();
+    return total;
+}
+char *StackTrace::multi_stack_info::pack( char *ptr ) const
+{
+    int N2 = N; // Layout written: [int N][stack_info][int child count][children...]
+    memcpy( ptr, &N2, sizeof( int ) );
+    ptr += sizeof( int );
+    ptr    = stack.pack( ptr );
+    int Nc = children.size();
+    memcpy( ptr, &Nc, sizeof( int ) );
+    ptr += sizeof( int );
+    for ( const auto &tmp : children ) // Each child is packed recursively, in order
+        ptr = tmp.pack( ptr );
+    return ptr; // Position just after the last byte written
+}
+const char *StackTrace::multi_stack_info::unpack( const char *ptr )
+{
+    int N2, Nc; // Count of this node and number of children, in the order pack() wrote them
+    memcpy( &N2, ptr, sizeof( int ) );
+    ptr += sizeof( int );
+    N   = N2;
+    ptr = stack.unpack( ptr );
+    memcpy( &Nc, ptr, sizeof( int ) );
+    ptr += sizeof( int );
+    children.resize( Nc ); // NOTE(review): Nc is taken from the buffer without validation
+    for ( auto &tmp : children )
+        ptr = tmp.unpack( ptr );
+    return ptr; // Position just after the last byte read
+}
+
+
+/****************************************************************************
+ *  Function to get the executable name                                      *
+ ****************************************************************************/
+static std::array<char, 1000> getExecutableName()
+{
+    std::array<char, 1000> exe{}; // Zero-filled: an empty string is returned on failure
+    try {
+#ifdef USE_LINUX
+        char buf[0x10000] = { 0 };
+        int len           = ::readlink( "/proc/self/exe", buf, sizeof( buf ) - 1 );
+        if ( len != -1 ) {
+            buf[len] = '\0'; // readlink does not terminate the result
+            strncpy( exe.data(), buf, exe.size() - 1 ); // Longer paths are truncated
+        }
+#elif defined( USE_MAC )
+        uint32_t size     = 0x10000;
+        char buf[0x10000] = { 0 };
+        if ( _NSGetExecutablePath( buf, &size ) == 0 )
+            strncpy( exe.data(), buf, exe.size() - 1 ); // Longer paths are truncated
+#elif defined( USE_WINDOWS )
+        DWORD size        = 0x10000;
+        char buf[0x10000] = { 0 };
+        GetModuleFileName( nullptr, buf, size );
+        strncpy( exe.data(), buf, exe.size() - 1 ); // Longer paths are truncated
+#endif
+    } catch ( ... ) {
+    }
+    return exe;
+}
+static const char *getExecutable2()
+{
+    static auto execname = getExecutableName(); // Looked up once, on the first call
+    return execname.data();                     // Points into the function-local static
+}
+std::string StackTrace::getExecutable() { return std::string( getExecutable2() ); } // Copy of the cached name
+
+
+/****************************************************************************
+ * Function to get symbols for the executable from nm (if available)         *
+ * Note: this function maintains an internal cached copy to prevent          *
+ *    excessive calls to nm.  This function also uses a lock to ensure       *
+ *    thread safety.                                                         *
+ ****************************************************************************/
+static_assert( sizeof( StackTrace::symbols_struct ) <= 128, "Unexpected size for symbols_struct" );
+std::vector<StackTrace::symbols_struct> global_symbols_data;
+static bool global_symbols_loaded = false;
+static std::vector<StackTrace::symbols_struct> getSymbolData()
+{
+    std::vector<StackTrace::symbols_struct> data; // Stays empty without USE_NM or on any exception
+#ifdef USE_NM
+    try {
+        char cmd[1024];
+#ifdef USE_LINUX
+        sprintf( cmd, "nm -n --demangle %s", getExecutable2() );
+#elif defined( USE_MAC )
+        sprintf( cmd, "nm -n %s | c++filt", getExecutable2() );
+#else
+#error Unknown OS using nm
+#endif
+        // Parse one line of nm output of the form "<hex address> <type> <name>" (modifies line)
+        auto fun = [&data]( char *line ) {
+            if ( line[0] == ' ' ) // Lines that begin with a blank are skipped
+                return;
+            auto *a = line; // a: address field
+            char *b = strchr( a, ' ' );
+            if ( b == nullptr )
+                return;
+            b[0] = 0; // Terminate the address field
+            b++;      // b: type field
+            char *c = strchr( b, ' ' );
+            if ( c == nullptr )
+                return;
+            c[0] = 0; // Terminate the type field
+            c++;      // c: symbol name, up to the end of the line
+            char *d = strchr( c, '\n' );
+            if ( d )
+                d[0] = 0; // Strip the trailing newline
+            size_t add = strtoul( a, nullptr, 16 );
+            size_t k   = data.size();
+            data.resize( k + 1 );
+            data[k].address = reinterpret_cast<void *>( add );
+            data[k].type    = b[0]; // Only the first character of the type field is kept
+            copy( c, data[k].obj, data[k].objPath ); // Split at the last '/' or '\' in the name
+        };
+        // Run the command and parse every line of its output
+        exec3( cmd, fun );
+    } catch ( ... ) {
+    }
+#endif
+    return data;
+}
+std::vector<StackTrace::symbols_struct> StackTrace::getSymbols()
+{
+    // Scoped lock: the mutex is released even if loading or copying the symbols throws
+    std::lock_guard<std::mutex> lock( StackTrace_mutex );
+    if ( !global_symbols_loaded ) {
+        global_symbols_data   = getSymbolData(); // Filled on first use, then cached
+        global_symbols_loaded = true;
+    }
+    auto data = global_symbols_data; // Callers receive a copy taken while the lock is held
+    return data;
+}
+void StackTrace::clearSymbols()
+{
+    // Scoped lock: the mutex is released on every exit path
+    std::lock_guard<std::mutex> lock( StackTrace_mutex );
+    if ( global_symbols_loaded ) {
+        global_symbols_data   = std::vector<StackTrace::symbols_struct>(); // Frees the cache
+        global_symbols_loaded = false; // The next getSymbols() call loads the symbols again
+    }
+}
+
+
+/****************************************************************************
+ *  Function to get call stack info                                          *
+ ****************************************************************************/
+#ifdef USE_MAC
+static void *loadAddress( const uint32_t &obj_hash )
+{
+    // Map from hashString(image file name) to the image header address,
+    // filled on the first call from the images reported by dyld
+    static std::map<uint32_t, void *> obj_map;
+    if ( obj_map.empty() ) {
+        uint32_t numImages = _dyld_image_count();
+        for ( uint32_t i = 0; i < numImages; i++ ) {
+            auto header = _dyld_get_image_header( i );
+            auto name   = _dyld_get_image_name( i );
+            if ( header == nullptr || name == nullptr )
+                continue; // Skip entries dyld could not describe
+            // Hash only the file name; a name without '/' is hashed as-is
+            auto p       = strrchr( name, '/' );
+            auto address = const_cast<struct mach_header *>( header );
+            auto hash    = hashString( p ? p + 1 : name );
+            obj_map.insert( std::make_pair( hash, address ) );
+        }
+    }
+    // Return the address stored for obj_hash, or nullptr when it is unknown
+    auto it = obj_map.find( obj_hash );
+    if ( it == obj_map.end() )
+        return nullptr;
+    return it->second;
+}
+// Split an atos line "<function> (in <object>) (<file>:<line>)"; fields not found stay zeroed
+static auto split_atos( const std::string &buf )
+{
+    int line = 0;
+    std::array<char, 2048> fun{};
+    std::array<char, 64> obj{}, file{}, objPath{}, filePath{};
+    // Return copies: std::tie would hand back references to locals that are dead on return
+    auto result = [&]() { return std::make_tuple( fun, obj, objPath, file, filePath, line ); };
+    if ( buf.empty() )
+        return result();
+    // Get the function (the whole line when there is no " (in " marker)
+    size_t index = buf.find( " (in " );
+    copy( buf.substr( 0, index ).c_str(), fun );
+    cleanupFunctionName( fun );
+    if ( index == std::string::npos )
+        return result();
+    std::string tmp = buf.substr( index + 5 );
+    // Get the object
+    index = tmp.find( ')' );
+    copy( tmp.substr( 0, index ).c_str(), obj, objPath );
+    tmp = tmp.substr( index + 1 );
+    // Get the filename and line number
+    size_t p1 = tmp.find( '(' );
+    size_t p2 = tmp.find( ')' );
+    tmp       = tmp.substr( p1 + 1, p2 - p1 - 1 );
+    index     = tmp.find( ':' );
+    if ( index != std::string::npos ) {
+        copy( tmp.substr( 0, index ).c_str(), file, filePath );
+        line = std::stoi( tmp.substr( index + 1 ) );
+    } else if ( p1 != std::string::npos ) {
+        copy( tmp.c_str(), file, filePath );
+    }
+    return result();
+}
+#endif
+// clang-format off
+// Fill in function/file/line for stack entries that all belong to one object (named by info[0])
+template<std::size_t blockSize>
+static void getFileAndLineObject( staticVector<StackTrace::stack_info*,blockSize> &info )
+{
+    if ( info.empty() )
+        return;
+    // All entries are resolved with a single addr2line (Linux) or atos (Mac) invocation
+    #if defined( USE_LINUX )
+        // Create the call command (NOTE(review): the object path reaches the shell unquoted)
+        uint32_t N;
+        char cmd[4096];
+        static_assert( sizeof(unsigned long) == sizeof(size_t), "Unxpected size for ul" );
+        if ( info[0]->objectPath[0] == 0 )
+            N = sprintf(cmd,"addr2line -C -e %s -f",info[0]->object.data());
+        else
+            N = sprintf(cmd,"addr2line -C -e %s/%s -f",info[0]->objectPath.data(),info[0]->object.data());
+        for (size_t i=0; i<info.size() && N + 64 < sizeof(cmd); i++) { // reserve: 34 per pair + 14 for the redirect
+            N += sprintf(&cmd[N]," %lx %lx",
+                reinterpret_cast<unsigned long>( info[i]->address ),
+                reinterpret_cast<unsigned long>( info[i]->address2 ) );
+        }
+        N += sprintf(&cmd[N]," 2> /dev/null");
+        // Get the function/line/file (four output lines per entry: two per queried address)
+        staticVector<std::array<char, 1024>,4*blockSize> output;
+        exec2( cmd, output );
+        if ( output.size() != 4*info.size() )
+            return;
+        // Add the results to info
+        for (size_t i=0; i<info.size(); i++) {
+            char *tmp1 = output[4*i+0].data();
+            char *tmp2 = output[4*i+1].data();
+            if ( tmp1[0] == '?' && tmp1[1] == '?' ) {
+                tmp1 = output[4*i+2].data();
+                tmp2 = output[4*i+3].data();
+            }
+            if ( tmp1[0] == '?' && tmp1[1] == '?' )
+                continue;
+            // get function name
+            if ( info[i]->function.empty() ) {
+                cleanupFunctionName( tmp1 );
+                copy( tmp1, info[i]->function );
+            }
+            // get file and line
+            char *buf = tmp2;
+            if ( buf[0] != '?' && buf[0] != 0 ) {
+                size_t j = 0; // find the ':' without running past the terminator or the buffer
+                while ( j + 1 < output[0].size() && buf[j] != ':' && buf[j] != 0 ) { j++; }
+                bool hasLine = buf[j] == ':';
+                buf[j] = 0;
+                copy( buf, info[i]->filename, info[i]->filenamePath );
+                info[i]->line = hasLine ? atoi( &buf[j + 1] ) : 0;
+            }
+        }
+    #elif defined( USE_MAC )
+        // Find where the object is loaded; atos needs the load address
+        void* load_address = loadAddress( hashString( info[0]->object.data() ) );
+        if ( load_address == nullptr )
+            return;
+        // Call atos to get the object info (NOTE(review): the object path reaches the shell unquoted)
+        uint32_t N;
+        char cmd[4096];
+        static_assert( sizeof(unsigned long) == sizeof(size_t), "Unxpected size for ul" );
+        auto addr = reinterpret_cast<unsigned long>( load_address );
+        if ( info[0]->objectPath[0] == 0 )
+            N = sprintf( cmd, "atos -o %s -f -l %lx", info[0]->object.data(), addr );
+        else
+            N = sprintf( cmd, "atos -o %s/%s -f -l %lx", info[0]->objectPath.data(), info[0]->object.data(), addr );
+        for (size_t i=0; i<info.size() && N < sizeof(cmd) - 32; i++)
+            N += sprintf( &cmd[N], " %lx", reinterpret_cast<unsigned long>( info[i]->address ) );
+        N += sprintf(&cmd[N]," 2> /dev/null");
+        // Get the function/line/file (one output line per entry)
+        staticVector<std::array<char, 1024>,blockSize> output;
+        exec2( cmd, output );
+        if ( output.size() != info.size() )
+            return;
+        // Parse the output for function, file and line info; only fields still unset are filled
+        for ( size_t i=0; i<info.size(); i++) {
+            auto data = split_atos( output[i].data() ); // output[i], not [2*i]: sizes were checked equal
+            if ( info[i]->function.empty() )
+                info[i]->function = std::get<0>(data);
+            if ( info[i]->object.empty() ) {
+                info[i]->object = std::get<1>(data);
+                info[i]->objectPath = std::get<2>(data);
+            }
+            if ( info[i]->filename.empty() ) {
+                info[i]->filename = std::get<3>(data);
+                info[i]->filenamePath = std::get<4>(data);
+            }
+            if ( info[i]->line==0 )
+                info[i]->line = std::get<5>(data);
+        }
+    #endif
+}
+// Fill in file/line information for N stack entries, processing them in fixed-size blocks
+static void getFileAndLine( size_t N, StackTrace::stack_info *info )
+{
+    constexpr size_t blockSize = 1024;
+    for ( size_t first = 0; first < N; ) {
+        const size_t last = std::min( N, first + blockSize );
+        // Collect the object hashes that occur in [first,last)
+        staticVector<uint64_t,blockSize> hashes;
+        for ( size_t k = first; k < last; k++ )
+            hashes.insert( objHash( info[k].object, info[k].objectPath ) );
+        // Hand all entries that share an object to a single getFileAndLineObject call
+        for ( const auto &h : hashes ) {
+            staticVector<StackTrace::stack_info*,blockSize> sameObject;
+            for ( size_t k = first; k < last; k++ ) {
+                if ( objHash( info[k].object, info[k].objectPath ) == h )
+                    sameObject.push_back( &info[k] );
+            }
+            getFileAndLineObject( sameObject );
+        }
+        first = last;
+    }
+}
+// Try to use the global symbols to decode the object (and object path) containing info.address
+static void getDataFromGlobalSymbols( StackTrace::stack_info &info )
+{
+    if ( !global_symbols_loaded ) {
+        global_symbols_data   = getSymbolData();
+        global_symbols_loaded = true;
+    }
+    const auto &data = global_symbols_data;
+    if ( !data.empty() ) {
+        // Find the closest address by bisection (presumably data is sorted by address; verify)
+        size_t lower = 0;
+        size_t upper = data.size() - 1;
+        while ( ( upper - lower ) > 1 ) { // "> 1": "!= 1" never terminates for a single symbol
+            size_t value = ( upper + lower ) / 2;
+            if ( data[value].address >= info.address )
+                upper = value;
+            else
+                lower = value;
+        }
+        if ( upper > 0 ) {
+            copy( data[lower].obj, info.object );
+            copy( data[lower].objPath, info.objectPath );
+        } else {
+            copy( getExecutable2(), info.object, info.objectPath );
+        }
+    }
+}
+static void signal_handler( int sig ) // SIGINT handler that getStackInfo2 installs while it decodes a stack
+{
+    printf("Signal caught acquiring stack (%i)\n",sig); // NOTE(review): printf is not async-signal-safe
+    StackTrace::setErrorHandler( [](const StackTrace::abort_error &err) { std::cerr << err.what(); exit( -1 ); } ); // new handler prints the message and exits
+}
+// Decode N raw addresses into info[0..N-1]: object, function and (where available) file/line.
+// Best effort: any exception is swallowed and the affected entries are left partially filled.
+static void getStackInfo2( size_t N, void* const* address, StackTrace::stack_info *info )
+{
+    // Temporarily handle signals to prevent recursion on the stack
+    auto prev_handler = signal( SIGINT, signal_handler );
+    // Get the detailed stack info
+    try {
+        #ifdef USE_WINDOWS
+            IMAGEHLP_SYMBOL64 pSym[1024];
+            memset( pSym, 0, sizeof( pSym ) );
+            pSym->SizeOfStruct  = sizeof( IMAGEHLP_SYMBOL64 );
+            pSym->MaxNameLength = 1024;
+
+            IMAGEHLP_MODULE64 Module;
+            memset( &Module, 0, sizeof( Module ) );
+            Module.SizeOfStruct = sizeof( Module );
+
+            HANDLE pid = GetCurrentProcess();
+
+            for (size_t i=0; i<N; i++) {
+                info[i].address = address[i];
+                DWORD64 address2 = reinterpret_cast<DWORD64>( address[i] );
+                DWORD64 offsetFromSymbol;
+                if ( SymGetSymFromAddr( pid, address2, &offsetFromSymbol, pSym ) != FALSE ) {
+                    char name[8192]={0};
+                    DWORD rtn = UnDecorateSymbolName( pSym->Name, name, sizeof(name)-1, UNDNAME_COMPLETE );
+                    // rtn == 0 means undecoration failed: fall back to the decorated name
+                    char *symName = ( rtn == 0 ) ? pSym->Name : name;
+                    cleanupFunctionName( symName );
+                    copy( symName, info[i].function );
+                } else {
+                    printf( "ERROR: SymGetSymFromAddr (%d,%p)\n", static_cast<int>( GetLastError() ), address[i] );
+                }
+
+                // Get line number
+                IMAGEHLP_LINE64 Line;
+                memset( &Line, 0, sizeof( Line ) );
+                Line.SizeOfStruct = sizeof( Line );
+                DWORD offsetFromLine;
+                if ( SymGetLineFromAddr64( pid, address2, &offsetFromLine, &Line ) != FALSE ) {
+                    info[i].line     = Line.LineNumber;
+                    copy( Line.FileName, info[i].filename, info[i].filenamePath );
+                } else {
+                    info[i].line     = 0;
+                    copy( nullptr, info[i].filename, info[i].filenamePath );
+                }
+
+                // Get the object
+                if ( SymGetModuleInfo64( pid, address2, &Module ) != FALSE ) {
+                    copy( Module.LoadedImageName, info[i].object, info[i].objectPath );
+                }
+            }
+        #else
+            for (size_t i=0; i<N; i++) {
+                info[i].address = address[i];
+                #if defined(_GNU_SOURCE) || defined(USE_MAC)
+                    Dl_info dlinfo;
+                    if ( !dladdr( info[i].address, &dlinfo ) ) {
+                        getDataFromGlobalSymbols( info[i] );
+                        continue;
+                    }
+                    info[i].address2 = subtractAddress( info[i].address, dlinfo.dli_fbase );
+                    copy( dlinfo.dli_fname, info[i].object, info[i].objectPath );
+                    #if defined( USE_ABI )
+                        int status;
+                        char *demangled = abi::__cxa_demangle( dlinfo.dli_sname, nullptr, nullptr, &status );
+                        if ( status == 0 && demangled != nullptr ) {
+                            cleanupFunctionName( demangled );
+                            copy( demangled, info[i].function );
+                        } else if ( dlinfo.dli_sname != nullptr ) {
+                            copy( dlinfo.dli_sname, info[i].function );
+                        }
+                        free( demangled );
+                    #endif
+                    if ( dlinfo.dli_sname != nullptr && info[i].function[0] == 0 ) {
+                        std::array<char,4096> tmp;
+                        copy( dlinfo.dli_sname, tmp );
+                        cleanupFunctionName( tmp.data() );
+                        copy( tmp, info[i].function );
+                    }
+                #else
+                    getDataFromGlobalSymbols( info[i] );
+                #endif
+            }
+            // Get the filename / line numbers for each item on the stack
+            getFileAndLine( N, info );
+        #endif
+    } catch ( ... ) {
+    }
+    signal( SIGINT, prev_handler ) ;
+}
+StackTrace::stack_info StackTrace::getStackInfo( void *address ) // decode a single address
+{
+    StackTrace::stack_info decoded;
+    getStackInfo2( 1, &address, &decoded );
+    return decoded;
+}
+std::vector<StackTrace::stack_info> StackTrace::getStackInfo( const std::vector<void*>& address )
+{
+    std::vector<StackTrace::stack_info> decoded( address.size() ); // one entry per address, same order
+    getStackInfo2( decoded.size(), address.data(), decoded.data() );
+    return decoded;
+}
+
+
+/****************************************************************************
+*  Helper functions for controlling internal signals                        *
+****************************************************************************/
+static int backtrace_thread( const std::thread::native_handle_type&, void**, size_t );
+#if defined( USE_LINUX ) || defined( USE_MAC )
+static int global_thread_backtrace_count; // frame count stored by the handler; backtrace_thread resets it to -1
+static void* global_thread_backtrace[1000]; // frames written by _callstack_signal_handler
+static void _callstack_signal_handler( int, siginfo_t*, void* ) // installed by backtrace_thread through sigaction
+{
+    global_thread_backtrace_count = backtrace_thread( StackTrace::thisThread(), global_thread_backtrace, 1000 ); // trace of the thread running the handler
+}
+static int get_thread_callstack_signal() // pick the signal number used to interrupt other threads
+{
+    const int preferred = 39;
+    const bool usable   = SIGRTMIN <= preferred && preferred <= SIGRTMAX;
+    return usable ? preferred : std::min<int>( SIGRTMIN+4, SIGRTMAX );
+}
+static int thread_callstack_signal = get_thread_callstack_signal(); // evaluated once during static initialization
+#endif
+
+
+/****************************************************************************
+*  Function to get the list of all active threads                           *
+****************************************************************************/
+#if defined( USE_LINUX ) || defined( USE_MAC )
+static std::thread::native_handle_type thread_handle; // handle stored by the signalled thread
+static bool thread_id_finished;                       // set after thread_handle has been stored
+// Signal handler: publish the handle of the thread it runs on, then raise the finished flag
+static void _activeThreads_signal_handler( int )
+{
+    thread_handle      = StackTrace::thisThread();
+    thread_id_finished = true;
+}
+#endif
+#ifdef USE_LINUX
+// Parse one line of "ps -T -p <pid>" output (space separated columns): return the number in
+// the second column (the thread id) when the first column equals pid, and -1 otherwise.
+static constexpr int get_tid( int pid, const char *line )
+{
+    int value[2] = { 0, 0 };
+    int i        = 0;
+    for ( int col = 0; col < 2; col++ ) {
+        while ( line[i] == ' ' ) { i++; }
+        // The thread id must be numeric (a non-numeric first column, e.g. "PID", reads as 0)
+        if ( col == 1 && ( line[i] < '0' || line[i] > '9' ) )
+            return -1;
+        // Accumulate digits in place: no scratch buffer to overflow or to keep stale digits
+        while ( line[i] >= '0' && line[i] <= '9' )
+            value[col] = 10 * value[col] + ( line[i++] - '0' );
+        // Skip the remainder of the token without running past the terminator
+        while ( line[i] != ' ' && line[i] != 0 ) { i++; }
+        if ( col == 0 && value[0] != pid )
+            return -1;
+    }
+    return value[1];
+}
+#endif
+std::thread::native_handle_type StackTrace::thisThread( ) // native handle of the calling thread
+{
+    #if defined( USE_LINUX ) || defined( USE_MAC )
+        return pthread_self();
+    #elif defined( USE_WINDOWS )
+        return GetCurrentThread(); // NOTE(review): presumably a pseudo handle meaning "the current thread"; confirm
+    #else
+        #warning Stack trace is not supported on this compiler/OS
+        return std::thread::native_handle_type(); // value-initialized placeholder
+    #endif
+}
+// List the native handles of this process' threads (minus the global monitor thread), sorted
+static staticVector<std::thread::native_handle_type,1024> getActiveThreads( )
+{
+    staticVector<std::thread::native_handle_type,1024> threads;
+    #if defined( USE_LINUX )
+        int N_tid = 0, tid[1024];
+        int pid = getpid();
+        char cmd[128];
+        sprintf( cmd, "ps -T -p %i", pid );
+        auto fun = [&N_tid,&tid,pid]( const char* line ) {
+            int id = get_tid( pid, line );
+            if ( id != -1 && N_tid < 1024 )
+                tid[N_tid++] = id;
+        };
+        exec3( cmd, fun );
+        int myid = syscall(SYS_gettid);
+        for ( int i=0; i<N_tid; i++) {
+            if ( tid[i] == myid ) { std::swap( tid[i], tid[--N_tid] ); } // the caller is added at the end
+        }
+        auto old = signal( thread_callstack_signal, _activeThreads_signal_handler );
+        for ( int i=0; i<N_tid; i++) {
+            StackTrace_mutex.lock();
+            thread_id_finished = false;
+            thread_handle = StackTrace::thisThread();
+            syscall( SYS_tgkill, pid, tid[i], thread_callstack_signal );
+            auto t1 = std::chrono::high_resolution_clock::now();
+            auto t2 = std::chrono::high_resolution_clock::now();
+            while ( !thread_id_finished && std::chrono::duration<double>(t2-t1).count()<0.1 ) {
+                std::this_thread::yield();
+                t2 = std::chrono::high_resolution_clock::now();
+            }
+            if ( thread_id_finished ) { threads.push_back( thread_handle ); } // skip threads that never answered
+            StackTrace_mutex.unlock();
+        }
+        signal( thread_callstack_signal, old );
+    #elif defined( USE_MAC )
+        thread_act_port_array_t thread_list;
+        mach_msg_type_number_t thread_count = 0;
+        task_threads(mach_task_self(), &thread_list, &thread_count);
+        auto old = signal( thread_callstack_signal, _activeThreads_signal_handler );
+        for ( int i=0; i<thread_count; i++) {
+            if ( thread_list[i] == mach_thread_self() )
+                continue;
+            static bool called = false;
+            if ( !called ) {
+                called = true;
+                std::cerr << "activeThreads not finished for MAC\n";
+            }
+            /*
+            StackTrace_mutex.lock();
+            thread_id_finished = false;
+            thread_handle = thisThread();
+            x86_thread_state64_t state;
+            unsigned int count = MACHINE_THREAD_STATE_COUNT;
+            thread_abort( thread_list[i] );  // Abort system calls
+            thread_suspend( thread_list[i] );
+            thread_get_state( thread_list[i], MACHINE_THREAD_STATE, (thread_state_t) &state, &count );
+            state.__rip = (uint64_t) _activeThreads2;
+            thread_set_state( thread_list[i], MACHINE_THREAD_STATE, (thread_state_t) &state, MACHINE_THREAD_STATE_COUNT );
+            thread_resume( thread_list[i] );
+            //pthread_kill( thread_list[i], CALLSTACK_SIG );
+            //syscall( SYS___pthread_kill, getpid(), thread_list[i], CALLSTACK_SIG );
+            //syscall( SYS_kill, thread_list[i], CALLSTACK_SIG );
+            auto t1 = std::chrono::high_resolution_clock::now();
+            auto t2 = std::chrono::high_resolution_clock::now();
+            while ( !thread_id_finished && std::chrono::duration<double>(t2-t1).count()<0.1 ) {
+                std::this_thread::yield();
+                t2 = std::chrono::high_resolution_clock::now();
+            }
+            threads.push_back( thread_handle );
+            StackTrace_mutex.unlock();*/
+        }
+        signal( thread_callstack_signal, old );
+    #elif defined( USE_WINDOWS )
+        // Snapshot all threads in the system and keep the other threads of this process
+        HANDLE hThreadSnap = CreateToolhelp32Snapshot( TH32CS_SNAPTHREAD, 0 );
+        if( hThreadSnap != INVALID_HANDLE_VALUE ) {
+            // Fill in the size of the structure before using it
+            THREADENTRY32 te32;
+            te32.dwSize = sizeof(THREADENTRY32 );
+            // Walk the thread list (skipped if Thread32First fails; the snapshot is closed either way)
+            if( Thread32First( hThreadSnap, &te32 ) ) {
+                do {
+                    if ( te32.th32OwnerProcessID != GetCurrentProcessId() || te32.th32ThreadID == GetCurrentThreadId() )
+                        continue; // another process, or this thread (which is added below)
+                    // Callers need a handle, not an id.  NOTE(review): these handles are never closed
+                    HANDLE h = OpenThread( THREAD_ALL_ACCESS, FALSE, te32.th32ThreadID );
+                    if ( h != nullptr ) { threads.push_back( h ); }
+                } while( Thread32Next(hThreadSnap, &te32 ) );
+            }
+            CloseHandle( hThreadSnap );                 // Must clean up the snapshot object!
+        }
+    #else
+        #warning activeThreads is not yet supported on this compiler/OS
+    #endif
+    // Add the current thread
+    threads.push_back( StackTrace::thisThread() );
+    // Remove the globalMonitorThread
+    if ( globalMonitorThread ) {
+        auto globalThreadId = globalMonitorThread->native_handle();
+        for ( int i = threads.size() - 1; i >= 0; i-- ) {
+            if ( threads[i] == globalThreadId ) {
+                std::swap( threads[i], threads.back() );
+                threads.pop_back();
+            }
+        }
+    }
+    // Sort the threads (duplicates are not removed here)
+    std::sort( threads.begin(), threads.end() );
+    return threads;
+}
+// clang-format on
+// Return the handles of all active threads (sorted, as produced by getActiveThreads)
+std::vector<std::thread::native_handle_type> StackTrace::activeThreads()
+{
+    auto threads = getActiveThreads(); // already sorted there, so no second sort is needed
+    return std::vector<std::thread::native_handle_type>( threads.begin(), threads.end() );
+}
+
+
+/****************************************************************************
+ *  Function to get the backtrace                                            *
+ ****************************************************************************/
+// Store up to 'size' frame addresses of thread tid in buffer and return how many were stored.
+// On Linux/Mac another thread is traced by signalling it and waiting up to 0.15 s for its handler.
+static int backtrace_thread(
+    const std::thread::native_handle_type &tid, void **buffer, size_t size )
+{
+    int count = 0;
+#if defined( USE_LINUX ) || defined( USE_MAC )
+    // Get the trace
+    if ( tid == pthread_self() ) {
+        count = ::backtrace( buffer, size );
+    } else {
+        // Note: this will get the backtrace, but terminates the thread in the process!!!
+        StackTrace_mutex.lock();
+        struct sigaction sa;
+        sigfillset( &sa.sa_mask );
+        sa.sa_flags     = SA_SIGINFO;
+        sa.sa_sigaction = _callstack_signal_handler;
+        sigaction( thread_callstack_signal, &sa, nullptr );
+        global_thread_backtrace_count = -1; // -1 marks "handler has not run yet"
+        pthread_kill( tid, thread_callstack_signal );
+        auto t1 = std::chrono::high_resolution_clock::now();
+        auto t2 = std::chrono::high_resolution_clock::now();
+        while ( global_thread_backtrace_count == -1 &&
+                std::chrono::duration<double>( t2 - t1 ).count() < 0.15 ) {
+            std::this_thread::yield();
+            t2 = std::chrono::high_resolution_clock::now();
+        }
+        count = std::min<int>( std::max( global_thread_backtrace_count, 0 ), size ); // clamp to buffer
+        memcpy( buffer, global_thread_backtrace, count * sizeof( void * ) );
+        global_thread_backtrace_count = -1;
+        StackTrace_mutex.unlock();
+    }
+#elif defined( USE_WINDOWS )
+#if defined( DBGHELP )
+    // Load the modules for the stack trace
+    LoadModules();
+    // Initialize stackframe for first call
+    ::CONTEXT context;
+    memset( &context, 0, sizeof( context ) );
+    context.ContextFlags = CONTEXT_FULL;
+    RtlCaptureContext( &context );
+    STACKFRAME64 frame; // in/out stackframe
+    memset( &frame, 0, sizeof( frame ) );
+#ifdef _M_IX86
+    DWORD imageType = IMAGE_FILE_MACHINE_I386;
+    frame.AddrPC.Offset = context.Eip;
+    frame.AddrPC.Mode = AddrModeFlat;
+    frame.AddrFrame.Offset = context.Ebp;
+    frame.AddrFrame.Mode = AddrModeFlat;
+    frame.AddrStack.Offset = context.Esp;
+    frame.AddrStack.Mode = AddrModeFlat;
+#elif _M_X64
+    DWORD imageType        = IMAGE_FILE_MACHINE_AMD64;
+    frame.AddrPC.Offset    = context.Rip;
+    frame.AddrPC.Mode      = AddrModeFlat;
+    frame.AddrFrame.Offset = context.Rsp;
+    frame.AddrFrame.Mode   = AddrModeFlat;
+    frame.AddrStack.Offset = context.Rsp;
+    frame.AddrStack.Mode   = AddrModeFlat;
+#elif _M_IA64
+    DWORD imageType         = IMAGE_FILE_MACHINE_IA64;
+    frame.AddrPC.Offset     = context.StIIP;
+    frame.AddrPC.Mode       = AddrModeFlat;
+    frame.AddrFrame.Offset  = context.IntSp;
+    frame.AddrFrame.Mode    = AddrModeFlat;
+    frame.AddrBStore.Offset = context.RsBSP;
+    frame.AddrBStore.Mode   = AddrModeFlat;
+    frame.AddrStack.Offset  = context.IntSp;
+    frame.AddrStack.Mode    = AddrModeFlat;
+#else
+#error "Platform not supported!"
+#endif
+
+    auto pid = GetCurrentProcess();
+    for ( int frameNum = 0; frameNum < 1024 && count < static_cast<int>( size ); ++frameNum ) {
+        BOOL rtn = StackWalk64( imageType, pid, tid, &frame, &context, readProcMem,
+            SymFunctionTableAccess, SymGetModuleBase64, NULL );
+        if ( !rtn ) {
+            printf( "ERROR: StackWalk64 (%p)\n", frame.AddrPC.Offset );
+            break;
+        }
+        if ( frame.AddrPC.Offset != 0 ) {
+            buffer[count] = reinterpret_cast<void*>( frame.AddrPC.Offset );
+            count++;
+        }
+        if ( frame.AddrReturn.Offset == 0 )
+            break;
+    }
+    SetLastError( ERROR_SUCCESS );
+#endif
+#else
+#warning Stack trace is not supported on this compiler/OS
+#endif
+    return count;
+}
+// Return the frame addresses (at most 1000) of the call stack of thread tid
+std::vector<void *> StackTrace::backtrace( std::thread::native_handle_type tid )
+{
+    std::vector<void *> frames( 1000, nullptr );
+    frames.resize( backtrace_thread( tid, frames.data(), frames.size() ) );
+    return frames;
+}
+// Return the frame addresses (at most 1000) of the calling thread's call stack
+std::vector<void *> StackTrace::backtrace()
+{
+    std::vector<void *> frames( 1000, nullptr );
+    frames.resize( backtrace_thread( thisThread(), frames.data(), frames.size() ) );
+    return frames;
+}
+// Collect the backtrace of every thread reported by getActiveThreads(), one vector per thread
+std::vector<std::vector<void *>> StackTrace::backtraceAll()
+{
+    auto threadList       = getActiveThreads();
+    const size_t nThreads = threadList.size();
+    std::vector<std::vector<void *>> all( nThreads );
+    for ( size_t t = 0; t < nThreads; t++ ) {
+        auto &frames = all[t];
+        frames.resize( 1000 );
+        frames.resize( backtrace_thread( threadList[t], frames.data(), frames.size() ) );
+    }
+    return all;
+}
+
+
+/****************************************************************************
+ *  Function to get the current call stack                                   *
+ ****************************************************************************/
+// Capture the calling thread's backtrace (at most 1000 frames) and decode every frame
+std::vector<StackTrace::stack_info> StackTrace::getCallStack()
+{
+    void *frames[1000];
+    std::vector<StackTrace::stack_info> decoded( backtrace_thread( thisThread(), frames, 1000 ) );
+    getStackInfo2( decoded.size(), frames, decoded.data() );
+    return decoded;
+}
+// Capture the backtrace of thread id (at most 1000 frames) and decode every frame
+std::vector<StackTrace::stack_info> StackTrace::getCallStack( std::thread::native_handle_type id )
+{
+    void *frames[1000];
+    std::vector<StackTrace::stack_info> decoded( backtrace_thread( id, frames, 1000 ) );
+    getStackInfo2( decoded.size(), frames, decoded.data() );
+    return decoded;
+}
+// Decode a set of raw backtraces.  Every distinct address is decoded once (one getStackInfo
+// call for all of them) and the per-trace stacks are assembled from those results.
+static std::vector<std::vector<StackTrace::stack_info>> generateStacks(
+    const std::vector<std::vector<void *>> &trace )
+{
+    // Position of ptr within list, or list.size() when it is absent
+    auto indexOf = []( const std::vector<void *> &list, void *ptr ) {
+        return static_cast<size_t>( std::find( list.begin(), list.end(), ptr ) - list.begin() );
+    };
+    // Gather the distinct addresses in first-seen order
+    std::vector<void *> unique;
+    unique.reserve( 1024 );
+    for ( const auto &frames : trace ) {
+        for ( auto ptr : frames ) {
+            if ( indexOf( unique, ptr ) == unique.size() )
+                unique.push_back( ptr );
+        }
+    }
+    // Decode all distinct addresses at once
+    auto decoded = StackTrace::getStackInfo( unique );
+    // Rebuild the stack of each trace from the shared decoded entries
+    std::vector<std::vector<StackTrace::stack_info>> stack( trace.size() );
+    for ( size_t t = 0; t < trace.size(); t++ ) {
+        const auto &frames = trace[t];
+        auto &out          = stack[t];
+        out.resize( frames.size() );
+        for ( size_t f = 0; f < frames.size(); f++ ) {
+            // Every frame address was inserted into 'unique' above, so the lookup succeeds
+            out[f] = decoded[indexOf( unique, frames[f] )];
+        }
+    }
+    return stack;
+}
+// Merge the decoded stacks of all traces into a single multi_stack_info (N = number of traces)
+static StackTrace::multi_stack_info generateMultiStack(
+    const std::vector<std::vector<void *>> &trace )
+{
+    const auto perTrace = generateStacks( trace );
+    StackTrace::multi_stack_info merged;
+    merged.N = perTrace.size();
+    // Add the decoded stacks one by one, in trace order
+    for ( size_t t = 0; t < perTrace.size(); t++ )
+        merged.add( perTrace[t].size(), perTrace[t].data() );
+    return merged;
+}
+// Take a backtrace of every listed thread and merge them into one multi_stack_info
+static StackTrace::multi_stack_info generateMultiStack(
+    const staticVector<std::thread::native_handle_type, 1024> &threads )
+{
+    std::vector<std::vector<void *>> trace;
+    trace.reserve( threads.size() );
+    auto cursor = threads.begin();
+    for ( size_t k = threads.size(); k > 0; --k, ++cursor )
+        trace.push_back( StackTrace::backtrace( *cursor ) );
+    return generateMultiStack( trace );
+}
+// Return the merged call stacks of all threads reported by getActiveThreads()
+StackTrace::multi_stack_info StackTrace::getAllCallStacks()
+{
+    // List the active threads
+    auto activeList = getActiveThreads();
+    // Build the multi-stack structure from their backtraces
+    return generateMultiStack( activeList );
+}
+
+
+/****************************************************************************
+ *  Function to get system search paths                                      *
+ ****************************************************************************/
+// Build the ';'-separated list of directories searched for symbols (only filled on Windows)
+std::string StackTrace::getSymPaths()
+{
+    std::string paths;
+#ifdef USE_WINDOWS
+    paths = std::string( ".;" );
+    paths.reserve( 1000 );
+    paths += getCurrentDirectory() + ";";
+    char temp[1024];
+    // Read environment variable 'name' into temp; true when the API reports a non-zero length
+    auto readEnv = [&temp]( const char *name ) {
+        memset( temp, 0, sizeof( temp ) );
+        return GetEnvironmentVariableA( name, temp, sizeof( temp ) - 1 ) > 0;
+    };
+    // Now add the path for the main-module:
+    memset( temp, 0, sizeof( temp ) );
+    if ( GetModuleFileNameA( nullptr, temp, sizeof( temp ) - 1 ) > 0 ) {
+        for ( char *p = ( temp + strlen( temp ) - 1 ); p >= temp; --p ) {
+            // locate the rightmost path separator
+            if ( ( *p == '\\' ) || ( *p == '/' ) || ( *p == ':' ) ) {
+                *p = 0;
+                break;
+            }
+        }
+        if ( strlen( temp ) > 0 ) {
+            paths += temp;
+            paths += ";";
+        }
+    }
+    if ( readEnv( "_NT_SYMBOL_PATH" ) ) {
+        paths += temp;
+        paths += ";";
+    }
+    if ( readEnv( "_NT_ALTERNATE_SYMBOL_PATH" ) ) {
+        paths += temp;
+        paths += ";";
+    }
+    if ( readEnv( "SYSTEMROOT" ) ) {
+        paths += temp;
+        paths += ";";
+        // also add the "system32"-directory:
+        paths += temp;
+        paths += "\\system32;";
+    }
+    // Symbol server entry "SRV*<cache dir>*<url>": no ';' may follow "SRV*" or the entry is split
+    if ( readEnv( "SYSTEMDRIVE" ) ) {
+        paths += "SRV*" + std::string( temp ) + "\\websymbols*http://msdl.microsoft.com/download/symbols;";
+    } else {
+        paths += "SRV*c:\\websymbols*http://msdl.microsoft.com/download/symbols;";
+    }
+#endif
+    return paths;
+}
+
+
+/****************************************************************************
+ *  Load modules for windows                                                 *
+ ****************************************************************************/
+#ifdef USE_WINDOWS
+// Pass every module in a ToolHelp32 snapshot of process pid to LoadModule; TRUE if any was found
+BOOL StackTrace::GetModuleListTH32( HANDLE hProcess, DWORD pid )
+{
+    // Signatures of the ToolHelp32 entry points, which are resolved at run time
+    typedef HANDLE( __stdcall * tCT32S )( DWORD dwFlags, DWORD th32ProcessID ); // CreateToolhelp32Snapshot()
+    typedef BOOL( __stdcall * tM32F )( HANDLE hSnapshot, LPMODULEENTRY32 lpme ); // Module32First()
+    typedef BOOL( __stdcall * tM32N )( HANDLE hSnapshot, LPMODULEENTRY32 lpme ); // Module32Next()
+
+    // Libraries that may export the functions, tried in order
+    const TCHAR *dllname[] = { _T("kernel32.dll"), _T("tlhelp32.dll") };
+    HINSTANCE library      = nullptr;
+    tCT32S createSnap      = nullptr;
+    tM32F firstModule      = nullptr;
+    tM32N nextModule       = nullptr;
+
+    for ( const TCHAR *dll : dllname ) {
+        library = LoadLibrary( dll );
+        if ( library == nullptr )
+            continue;
+        createSnap  = (tCT32S) GetProcAddress( library, "CreateToolhelp32Snapshot" );
+        firstModule = (tM32F) GetProcAddress( library, "Module32First" );
+        nextModule  = (tM32N) GetProcAddress( library, "Module32Next" );
+        if ( createSnap && firstModule && nextModule )
+            break; // found the functions!
+        // At least one entry point is missing: release this library and try the next one
+        FreeLibrary( library );
+        library = nullptr;
+    }
+    if ( library == nullptr )
+        return FALSE;
+
+    // Take a snapshot of the modules of the process
+    HANDLE snapshot = createSnap( TH32CS_SNAPMODULE, pid );
+    if ( snapshot == (HANDLE) -1 ) {
+        FreeLibrary( library );
+        return FALSE;
+    }
+
+    // Walk the snapshot, counting the modules handed to LoadModule
+    MODULEENTRY32 entry;
+    entry.dwSize = sizeof( entry );
+    int loaded   = 0;
+    bool more    = !!firstModule( snapshot, &entry );
+    while ( more ) {
+        LoadModule( hProcess, entry.szExePath, entry.szModule, (DWORD64) entry.modBaseAddr, entry.modBaseSize );
+        loaded++;
+        more = !!nextModule( snapshot, &entry );
+    }
+    // Release the snapshot and the library before reporting the result
+    CloseHandle( snapshot );
+    FreeLibrary( library );
+    if ( loaded <= 0 )
+        return FALSE;
+    return TRUE;
+}
+// Register one module with the symbol handler through SymLoadModule.
+//   hProcess - process handle forwarded to SymLoadModule
+//   img      - image file name of the module
+//   mod      - module name
+//   baseAddr - base address of the module
+//   size     - size of the module image
+// Returns ERROR_SUCCESS on success, ERROR_NOT_ENOUGH_MEMORY if the names could not be
+// duplicated, or the GetLastError() value when SymLoadModule returns 0.
+DWORD StackTrace::LoadModule(
+    HANDLE hProcess, LPCSTR img, LPCSTR mod, DWORD64 baseAddr, DWORD size )
+{
+    // Work on private copies of the names
+    // NOTE(review): presumably because SymLoadModule takes non-const strings -- confirm
+    CHAR *szImg  = _strdup( img );
+    CHAR *szMod  = _strdup( mod );
+    DWORD result = ERROR_SUCCESS;
+    if ( szImg == nullptr || szMod == nullptr )
+        result = ERROR_NOT_ENOUGH_MEMORY;
+    else if ( SymLoadModule( hProcess, 0, szImg, szMod, baseAddr, size ) == 0 )
+        result = GetLastError();
+    // The file-version and PDB-name queries that used to follow were removed: their
+    // results were never used, and the module info was read without checking that
+    // SymGetModuleInfo64 had succeeded.
+    free( szImg ); // free( nullptr ) is a no-op
+    free( szMod );
+    return result;
+}
+// Enumerate the modules of a process through the PSAPI functions and pass each one to
+// LoadModule.  At most 1024 modules are supported.
+// Returns TRUE if at least one module was processed, FALSE otherwise.
+BOOL StackTrace::GetModuleListPSAPI( HANDLE hProcess )
+{
+    DWORD cbNeeded = 0;
+    HMODULE hMods[1024];
+    char imageName[8192];
+    char moduleName[8192];
+    if ( !EnumProcessModules( hProcess, hMods, sizeof( hMods ), &cbNeeded ) )
+        return FALSE;
+    if ( cbNeeded > sizeof( hMods ) ) {
+        printf( "Insufficient memory allocated in GetModuleListPSAPI\n" );
+        return FALSE;
+    }
+    int cnt               = 0;
+    const DWORD N_modules = cbNeeded / sizeof( hMods[0] );
+    for ( DWORD i = 0; i < N_modules; i++ ) {
+        // base address, size
+        MODULEINFO mi;
+        if ( !GetModuleInformation( hProcess, hMods[i], &mi, sizeof( mi ) ) )
+            continue; // mi was not filled in, so there is nothing valid to load
+        // image file name
+        imageName[0] = 0;
+        GetModuleFileNameExA( hProcess, hMods[i], imageName, sizeof( imageName ) );
+        // module name
+        moduleName[0] = 0;
+        GetModuleBaseNameA( hProcess, hMods[i], moduleName, sizeof( moduleName ) );
+        DWORD dwRes = LoadModule(
+            hProcess, imageName, moduleName, (DWORD64) mi.lpBaseOfDll, mi.SizeOfImage );
+        if ( dwRes != ERROR_SUCCESS )
+            printf( "ERROR: LoadModule (%lu)\n", static_cast<unsigned long>( dwRes ) );
+        cnt++;
+    }
+
+    return cnt != 0 ? TRUE : FALSE;
+}
+// Initialize the symbol handler for the current process and load its module list.
+// Only the first call does any work; later calls return immediately.
+void StackTrace::LoadModules()
+{
+    static bool modules_loaded = false;
+    if ( modules_loaded )
+        return;
+    modules_loaded = true;
+
+    // Initialize the symbol handler with the symbol search paths
+    const std::string searchPaths = StackTrace::getSymPaths();
+    if ( SymInitialize( GetCurrentProcess(), searchPaths.c_str(), FALSE ) == FALSE )
+        printf( "ERROR: SymInitialize (%d)\n", GetLastError() );
+
+    // Request line information and suppress critical-error dialogs
+    DWORD options = SymGetOptions();
+    options |= SYMOPT_LOAD_LINES | SYMOPT_FAIL_CRITICAL_ERRORS;
+    options = SymSetOptions( options );
+
+    // Query the search path (only the failure is reported)
+    char pathBuffer[1024] = { 0 };
+    if ( SymGetSearchPath( GetCurrentProcess(), pathBuffer, sizeof( pathBuffer ) ) == FALSE )
+        printf( "ERROR: SymGetSearchPath (%d)\n", GetLastError() );
+
+    // Load the modules: toolhelp32 first, Psapi as the fallback
+    BOOL ok = StackTrace::GetModuleListTH32( GetCurrentProcess(), GetCurrentProcessId() );
+    if ( !ok )
+        ok = StackTrace::GetModuleListPSAPI( GetCurrentProcess() );
+}
+#endif
+
+
+/****************************************************************************
+ *  Get the signal name                                                      *
+ ****************************************************************************/
+// Cached signal descriptions: entry i holds the text for signal i+1 (at most 31
+// characters plus the terminator)
+static char signalNames[128][32];
+// Return the description of signal `sig` (1..128), or nullptr if sig is out of range.
+// The table is filled from strsignal() on first use while holding StackTrace_mutex.
+const char *StackTrace::signalName( int sig )
+{
+    static bool initialized = false;
+    if ( !initialized ) {
+        StackTrace_mutex.lock();
+        // Re-check under the lock so that only one thread fills the table
+        if ( !initialized ) {
+            memset( signalNames, 0, sizeof( signalNames ) );
+            for ( int i = 0; i < 128; i++ ) {
+                // strsignal may return a null pointer or a description longer than one
+                // slot; copy at most 31 characters so the terminator written by memset
+                // is preserved
+                const char *name = strsignal( i + 1 );
+                if ( name != nullptr )
+                    strncpy( signalNames[i], name, sizeof( signalNames[i] ) - 1 );
+            }
+            initialized = true;
+        }
+        StackTrace_mutex.unlock();
+    }
+    bool valid = sig > 0 && sig <= 128;
+    return valid ? signalNames[sig - 1] : nullptr;
+}
+// Return every signal number in 1..31 and SIGRTMIN..SIGRTMAX (in that order), leaving
+// out SIGKILL and SIGSTOP.
+std::vector<int> StackTrace::allSignalsToCatch()
+{
+    std::vector<int> list;
+    list.reserve( SIGRTMAX );
+    // Append [first,last] to the list, skipping SIGKILL and SIGSTOP
+    auto append = [&list]( int first, int last ) {
+        for ( int sig = first; sig <= last; ++sig ) {
+            if ( sig != SIGKILL && sig != SIGSTOP )
+                list.push_back( sig );
+        }
+    };
+    append( 1, 31 );
+    append( SIGRTMIN, SIGRTMAX );
+    return list;
+}
+// Remove the first element of x that compares equal to y.
+// Does nothing if no such element exists (erasing the end iterator would be undefined).
+template<class TYPE>
+static inline void erase( std::vector<TYPE> &x, TYPE y )
+{
+    auto it = std::find( x.begin(), x.end(), y );
+    if ( it != x.end() )
+        x.erase( it );
+}
+// Return allSignalsToCatch() without the signals that are not caught by default:
+// window changed (SIGWINCH), continue (SIGCONT) and child exited (SIGCHLD).
+std::vector<int> StackTrace::defaultSignalsToCatch()
+{
+    auto signals        = allSignalsToCatch();
+    const int ignored[] = { SIGWINCH, SIGCONT, SIGCHLD };
+    for ( int sig : ignored )
+        erase( signals, sig );
+    return signals;
+}
+
+
+/****************************************************************************
+ *  Set the signal handlers                                                  *
+ ****************************************************************************/
+// Callback invoked by the terminate / signal handlers below with the error description
+static std::function<void( const StackTrace::abort_error &err )> abort_fun;
+// Build an abort_error describing the exception that is currently being handled.
+// On Linux the active exception is re-thrown (bare `throw;`) and caught again to
+// recover its message; on other platforms the message is always "Unknown exception".
+// The type, memory usage and stack are filled in here if they are still unset.
+static StackTrace::abort_error rethrow()
+{
+    StackTrace::abort_error error;
+#ifdef USE_LINUX
+    try {
+        // tried_throw is static and never reset, so the re-throw is attempted at most once
+        // NOTE(review): presumably this guards against re-entering the terminate handler
+        // when there is no active exception -- confirm
+        static int tried_throw = 0;
+        if ( tried_throw == 0 ) {
+            tried_throw = 1;
+            throw;
+        }
+        // No active exception
+    } catch ( const StackTrace::abort_error &err ) {
+        // Caught a StackTrace::abort_error: keep all of its fields
+        error = err;
+    } catch ( const std::exception &err ) {
+        // Caught some other std::exception: only the message is available
+        error.message = err.what();
+    } catch ( ... ) {
+        // Caught an unknown exception
+        error.message = "Unknown exception";
+    }
+#else
+    error.message = "Unknown exception";
+#endif
+    // Fill in anything the caught exception did not provide
+    if ( error.type == StackTrace::terminateType::unknown )
+        error.type = StackTrace::terminateType::exception;
+    if ( error.bytes == 0 )
+        error.bytes = StackTrace::Utilities::getMemoryUsage();
+    if ( error.stack.empty() ) {
+        error.stackType = StackTrace::printStackType::local;
+        error.stack     = StackTrace::backtrace();
+    }
+    return error;
+}
+// Signal handler: describe the signal (number, memory usage, call stack) in an
+// abort_error and pass it to the registered abort function.
+static void term_func_abort( int sig )
+{
+    StackTrace::abort_error error;
+    error.type      = StackTrace::terminateType::signal;
+    error.signal    = sig;
+    error.bytes     = StackTrace::Utilities::getMemoryUsage();
+    error.stack     = StackTrace::backtrace();
+    error.stackType = StackTrace::printStackType::global;
+    abort_fun( error );
+}
+// signals_set[sig] is true while a handler installed through setSignals is active
+static bool signals_set[256] = { false };
+// Terminate handler: describe the active exception, restore the default signal
+// handlers and pass the description to the registered abort function.
+static void term_func()
+{
+    const auto error = rethrow();
+    StackTrace::clearSignals();
+    abort_fun( error );
+}
+// Handler that does nothing; installed by clearErrorHandler in place of term_func
+static void null_term_func() {}
+// Restore the default handler for `sig` if a handler was installed through setSignals.
+// Signal numbers outside the tracking table are ignored.
+void StackTrace::clearSignal( int sig )
+{
+    constexpr int N_signals = sizeof( signals_set ) / sizeof( signals_set[0] );
+    if ( sig < 0 || sig >= N_signals )
+        return; // not tracked, and indexing the table would be out of bounds
+    if ( signals_set[sig] ) {
+        signal( sig, SIG_DFL );
+        signals_set[sig] = false;
+    }
+}
+// Restore the default handler for every signal in `signals`, whether or not a handler
+// was installed through setSignals.
+void StackTrace::clearSignals( const std::vector<int> &signals )
+{
+    constexpr int N_signals = sizeof( signals_set ) / sizeof( signals_set[0] );
+    for ( auto sig : signals ) {
+        signal( sig, SIG_DFL );
+        // Only update the tracking table for numbers it can hold
+        if ( sig >= 0 && sig < N_signals )
+            signals_set[sig] = false;
+    }
+}
+// Restore the default handler for every signal that was installed through setSignals.
+void StackTrace::clearSignals()
+{
+    // Iterate over the number of entries (not the size in bytes) of the tracking table
+    constexpr int N_signals = sizeof( signals_set ) / sizeof( signals_set[0] );
+    for ( int sig = 0; sig < N_signals; sig++ ) {
+        if ( signals_set[sig] ) {
+            signal( sig, SIG_DFL );
+            signals_set[sig] = false;
+        }
+    }
+}
+// Install `handler` for every signal in `signals` and record the signal as set so that
+// clearSignal / clearSignals() can restore the default handler later.
+void StackTrace::setSignals( const std::vector<int> &signals, void ( *handler )( int ) )
+{
+    constexpr int N_signals = sizeof( signals_set ) / sizeof( signals_set[0] );
+    for ( auto sig : signals ) {
+        signal( sig, handler );
+        // Only update the tracking table for numbers it can hold
+        if ( sig >= 0 && sig < N_signals )
+            signals_set[sig] = true;
+    }
+    std::this_thread::yield();
+}
+// Raise the given signal in the calling program (forwards to std::raise)
+void StackTrace::raiseSignal( int signal ) { std::raise( signal ); }
+// Install `abort` as the callback for unhandled errors.  The callback is stored in
+// abort_fun, term_func is registered as the terminate and unexpected handler, and
+// term_func_abort is installed for every signal from defaultSignalsToCatch().
+// NOTE(review): std::set_unexpected was removed in C++17 -- confirm the target standard
+void StackTrace::setErrorHandler( std::function<void( const StackTrace::abort_error & )> abort )
+{
+    abort_fun = abort;
+    std::set_terminate( term_func );
+    setSignals( defaultSignalsToCatch(), &term_func_abort );
+    std::set_unexpected( term_func );
+}
+// Undo setErrorHandler: replace the callback with one that does nothing, install
+// null_term_func as the terminate and unexpected handler, and restore the default
+// handler for every signal that was set.
+void StackTrace::clearErrorHandler()
+{
+    abort_fun = []( const StackTrace::abort_error & ) {};
+    std::set_terminate( null_term_func );
+    clearSignals();
+    std::set_unexpected( null_term_func );
+}
+
+
+/****************************************************************************
+ *  Functions to handle MPI errors                                           *
+ ****************************************************************************/
+#ifdef USE_MPI
+// Return true if MPI has been initialized and has not yet been finalized
+static bool MPI_Initialized()
+{
+    int is_initialized = 0;
+    MPI_Initialized( &is_initialized );
+    int is_finalized = 0;
+    MPI_Finalized( &is_finalized );
+    const bool active = ( is_initialized != 0 ) && ( is_finalized == 0 );
+    return active;
+}
+// The MPI error handler created by setMPIErrorHandler (empty when none exists)
+static std::shared_ptr<MPI_Errhandler> mpierr;
+// MPI error handler: convert the MPI error code into a StackTrace::abort_error
+// (message, memory usage, call stack) and throw it.  An MPI_ERR_COMM error on
+// MPI_COMM_WORLD is reported on std::cerr and exits the program instead.
+static void MPI_error_handler_fun( MPI_Comm *comm, int *err, ... )
+{
+    const bool invalid_world = ( *err == MPI_ERR_COMM ) && ( *comm == MPI_COMM_WORLD );
+    if ( invalid_world ) {
+        // Special error handling for an invalid MPI_COMM_WORLD
+        std::cerr << "Error invalid MPI_COMM_WORLD";
+        exit( -1 );
+    }
+    // Get the text for the error code
+    char text[1000] = { 0 };
+    int text_len    = 0;
+    MPI_Error_string( *err, text, &text_len );
+    // Create and throw the error
+    StackTrace::abort_error mpi_error;
+    mpi_error.message   = std::string( text );
+    mpi_error.type      = StackTrace::terminateType::MPI;
+    mpi_error.bytes     = StackTrace::Utilities::getMemoryUsage();
+    mpi_error.stack     = StackTrace::backtrace();
+    mpi_error.stackType = StackTrace::printStackType::global;
+    throw mpi_error;
+}
+// Attach the StackTrace MPI error handler to `comm`, creating the handler on first use.
+// Does nothing if MPI is not initialized (or already finalized).
+void StackTrace::setMPIErrorHandler( MPI_Comm comm )
+{
+    if ( MPI_Initialized() ) {
+        if ( !mpierr ) {
+            // First use: create the error handler
+            mpierr = std::make_shared<MPI_Errhandler>();
+            MPI_Comm_create_errhandler( MPI_error_handler_fun, mpierr.get() );
+        }
+        MPI_Comm_set_errhandler( comm, *mpierr );
+    }
+}
+// Free the StackTrace MPI error handler (if one was created) and set the error handler
+// of `comm` to MPI_ERRORS_ARE_FATAL.  Does nothing if MPI is not initialized (or
+// already finalized).
+void StackTrace::clearMPIErrorHandler( MPI_Comm comm )
+{
+    if ( MPI_Initialized() ) {
+        if ( mpierr )
+            MPI_Errhandler_free( mpierr.get() ); // Delete the error handler
+        mpierr.reset();
+        MPI_Comm_set_errhandler( comm, MPI_ERRORS_ARE_FATAL );
+    }
+}
+#else
+// Without USE_MPI the MPI error handler functions do nothing
+void StackTrace::setMPIErrorHandler( MPI_Comm ) {}
+void StackTrace::clearMPIErrorHandler( MPI_Comm ) {}
+#endif
+
+
+/****************************************************************************
+ *  Global call stack functionality                                          *
+ ****************************************************************************/
+#ifdef USE_MPI
+// Communicator used for the call stack requests (MPI_COMM_NULL when not initialized)
+static MPI_Comm globalCommForGlobalCommStack  = MPI_COMM_NULL;
+// State of the monitor thread: -1 until globalCallStackInitialize is called, 3 once it
+// has been called, 1 while the monitor thread should run, 2 after finalize requested
+// it to stop
+static volatile int globalMonitorThreadStatus = -1;
+// Body of the monitor thread.  While the status is 1 it polls for requests (tag 1) on
+// the global communicator; each request carries the tag to reply with, and the reply is
+// the packed call stacks of the active threads of this process.
+static void runGlobalMonitorThread()
+{
+    while ( globalMonitorThreadStatus == 1 ) {
+        // Check for any messages
+        int flag = 0;
+        MPI_Status status;
+        int err = MPI_Iprobe( MPI_ANY_SOURCE, 1, globalCommForGlobalCommStack, &flag, &status );
+        if ( err != MPI_SUCCESS ) {
+            printf( "Internal error in StackTrace::getGlobalCallStacks::runGlobalMonitorThread\n" );
+            break;
+        } else if ( flag != 0 ) {
+            // We received a request
+            int src_rank = status.MPI_SOURCE;
+            int tag;
+            MPI_Recv( &tag, 1, MPI_INT, src_rank, 1, globalCommForGlobalCommStack, &status );
+            // Get the list of threads (except this)
+            auto threads = getActiveThreads();
+            if ( threads.empty() )
+                continue;
+            // Get the stack info for the threads
+            auto multistack = generateMultiStack( threads );
+            // Pack and send the data (the vector releases the buffer on every exit path)
+            std::vector<char> data( multistack.size() );
+            multistack.pack( data.data() );
+            MPI_Send( data.data(), static_cast<int>( data.size() ), MPI_CHAR, src_rank, tag,
+                globalCommForGlobalCommStack );
+        } else {
+            // No requests received
+            std::this_thread::sleep_for( std::chrono::milliseconds( 50 ) );
+        }
+    }
+}
+// Prepare for getGlobalCallStacks: check the prerequisites, duplicate `comm` and start
+// the monitor thread that answers call stack requests from other ranks.
+// The function returns early (after printing a warning) when MPI is not initialized,
+// when MPI does not provide MPI_THREAD_MULTIPLE, or when rank 0 sees exactly one active
+// thread while a helper thread is running.
+void StackTrace::globalCallStackInitialize( MPI_Comm comm )
+{
+    // Mark that initialize was called; only changed to 1 if all checks pass
+    globalMonitorThreadStatus = 3;
+    // Check that we have the necessary MPI thread support
+    if ( !MPI_Initialized() ) {
+        printf( "Warning: MPI not initialized before calling globalCallStackInitialize\n" );
+        return;
+    }
+    int rank = 0;
+    MPI_Comm_rank( comm, &rank );
+    int provided;
+    MPI_Query_thread( &provided );
+    if ( provided != MPI_THREAD_MULTIPLE ) {
+        if ( rank == 0 )
+            printf( "Warning: getGlobalCallStacks requires support for MPI_THREAD_MULTIPLE\n" );
+        return;
+    }
+    // Check that we have support to get call stacks from threads
+    int N_threads = 0;
+    if ( rank == 0 ) {
+        // Count the active threads while a helper thread sleeps for 200 ms
+        std::thread thread( StackTrace::Utilities::sleep_ms, 200 );
+        std::this_thread::yield();
+        auto thread_ids = getActiveThreads();
+        N_threads       = thread_ids.size();
+        thread.join();
+    }
+    // Share the count from rank 0 so that every rank takes the same branch
+    MPI_Bcast( &N_threads, 1, MPI_INT, 0, comm );
+    if ( N_threads == 1 ) {
+        if ( rank == 0 )
+            printf( "Warning: getAllCallStacks not supported on this OS\n" );
+        return;
+    }
+    // Create the communicator and initialize the helper thread
+    globalMonitorThreadStatus = 1;
+    MPI_Comm_dup( comm, &globalCommForGlobalCommStack );
+    globalMonitorThread.reset( new std::thread( runGlobalMonitorThread ) );
+    std::this_thread::sleep_for( std::chrono::milliseconds( 50 ) );
+}
+// Stop and join the monitor thread (if it exists) and free the communicator created by
+// globalCallStackInitialize.
+void StackTrace::globalCallStackFinalize()
+{
+    // Ask the monitor thread to leave its loop and wait for it
+    if ( globalMonitorThread ) {
+        globalMonitorThreadStatus = 2;
+        globalMonitorThread->join();
+        globalMonitorThread.reset();
+    }
+    // Release the duplicated communicator
+    if ( globalCommForGlobalCommStack != MPI_COMM_NULL ) {
+        MPI_Comm_free( &globalCommForGlobalCommStack );
+    }
+    globalCommForGlobalCommStack = MPI_COMM_NULL;
+}
+// Collect the call stacks of all other ranks.  A request (tag 1) carrying a random reply
+// tag is sent to every other rank; the packed replies are merged as they arrive.
+// Returns an empty result if globalCallStackInitialize was not called or did not enable
+// the monitor thread.  Ranks that do not answer within 10 s + 20 ms per rank are left
+// out of the result.
+StackTrace::multi_stack_info getRemoteCallStacks()
+{
+    if ( globalMonitorThreadStatus == -1 ) {
+        // User did not call globalCallStackInitialize
+        printf( "Warning: getGlobalCallStacks called without call to globalCallStackInitialize\n" );
+        return StackTrace::multi_stack_info();
+    } else if ( globalMonitorThreadStatus != 1 ) {
+        // globalCallStackInitialize is not supported
+        return StackTrace::multi_stack_info();
+    }
+    // Signal all processes that we want their stack for all threads
+    int rank = 0;
+    int size = 1;
+    MPI_Comm_size( globalCommForGlobalCommStack, &size );
+    MPI_Comm_rank( globalCommForGlobalCommStack, &rank );
+    std::random_device rd;
+    std::mt19937 gen( rd() );
+    std::uniform_int_distribution<> dis( 2, 0x7FFF );
+    int tag = dis( gen );
+    std::vector<MPI_Request> sendRequest( size );
+    for ( int i = 0; i < size; i++ ) {
+        if ( i == rank )
+            continue;
+        MPI_Isend( &tag, 1, MPI_INT, i, 1, globalCommForGlobalCommStack, &sendRequest[i] );
+    }
+    // Receive the backtrace for all remote processes/threads
+    int N_finished        = 1;
+    auto start            = std::chrono::steady_clock::now();
+    double time           = 0;
+    const double max_time = 10.0 + size * 20e-3;
+    StackTrace::multi_stack_info multistack;
+    while ( N_finished < size && time < max_time ) {
+        int flag = 0;
+        MPI_Status status;
+        int err = MPI_Iprobe( MPI_ANY_SOURCE, tag, globalCommForGlobalCommStack, &flag, &status );
+        if ( err != MPI_SUCCESS ) {
+            printf( "Internal error in StackTrace::getGlobalCallStacks\n" );
+            break;
+        } else if ( flag != 0 ) {
+            // We received a response
+            int src_rank = status.MPI_SOURCE;
+            int count;
+            MPI_Get_count( &status, MPI_CHAR, &count );
+            // The vector releases the receive buffer on every exit path
+            std::vector<char> data( count );
+            MPI_Recv( data.data(), count, MPI_CHAR, src_rank, tag, globalCommForGlobalCommStack,
+                &status );
+            StackTrace::multi_stack_info tmp;
+            tmp.unpack( data.data() );
+            multistack.add( tmp );
+            N_finished++;
+        } else {
+            // Keep the fractional seconds: max_time is not a whole number of seconds
+            auto stop = std::chrono::steady_clock::now();
+            time      = std::chrono::duration<double>( stop - start ).count();
+            std::this_thread::yield();
+        }
+    }
+    for ( int i = 0; i < size; i++ ) {
+        if ( i == rank )
+            continue;
+        MPI_Request_free( &sendRequest[i] );
+    }
+    return multistack;
+}
+#else
+// Without USE_MPI there are no remote processes: initialize / finalize do nothing and
+// the remote call stacks are always empty
+void StackTrace::globalCallStackInitialize( MPI_Comm ) {}
+void StackTrace::globalCallStackFinalize() {}
+StackTrace::multi_stack_info getRemoteCallStacks() { return StackTrace::multi_stack_info(); }
+#endif
+// Return the call stacks of the active threads of this process merged with the call
+// stacks returned by getRemoteCallStacks().
+StackTrace::multi_stack_info StackTrace::getGlobalCallStacks()
+{
+    // Local threads first
+    auto localThreads = getActiveThreads();
+    auto combined     = generateMultiStack( localThreads );
+    // Then add the stacks from the other processes
+    combined.add( getRemoteCallStacks() );
+    return combined;
+}
+
+
+/****************************************************************************
+ *  Cleanup the call stack                                                   *
+ ****************************************************************************/
+// Find the '>' that closes the '<' at index pos.
+//   str - character buffer (does not need to be null-terminated)
+//   N   - number of characters in str
+//   pos - index of the opening '<'
+// Returns the index one past the matching '>', or N if the bracket is never closed.
+static constexpr size_t findMatching( const char *str, size_t N, size_t pos ) noexcept
+{
+    size_t pos2 = pos + 1;
+    int count   = 1; // number of '<' that are still open
+    while ( count != 0 && pos2 < N ) {
+        if ( str[pos2] == '<' )
+            count++;
+        if ( str[pos2] == '>' )
+            count--;
+        pos2++;
+    }
+    return pos2;
+}
+// Overload for a std::array buffer; the whole array is searched
+template<std::size_t N>
+static constexpr size_t findMatching( const std::array<char, N> &str, size_t pos ) noexcept
+{
+    // Forward the starting position as well (it was previously dropped)
+    return findMatching( str.data(), N, pos );
+}
+// Shorten a demangled function name in place by replacing verbose standard library
+// spellings (std::ratio, std::chrono::duration, sleep_for, std::basic_string,
+// std::make_shared, std::vector allocators) with their abbreviated forms.
+//   function - null-terminated, writable function name
+static void cleanupFunctionName( char *function )
+{
+    constexpr size_t npos = std::string::npos;
+    // First find the string length
+    size_t N = strlen( function );
+    // Cleanup template space
+    strrep( function, N, " >", ">" );
+    strrep( function, N, "< ", "<" );
+    // Remove std::__1::
+    strrep( function, N, "std::__1::", "std::" );
+    // Search the current contents of the name
+    auto find = [&function, &N]( const string_view &str, size_t pos = 0 ) {
+        return string_view( function, N ).find( str, pos );
+    };
+    // Replace std::ratio with abbreviated version
+    if ( find( "std::ratio<" ) != npos ) {
+        strrep( function, N, "std::ratio<1l, 1000000000000000000000000l>", "std::yocto" );
+        strrep( function, N, "std::ratio<1l, 1000000000000000000000l>", "std::zepto" );
+        strrep( function, N, "std::ratio<1l, 1000000000000000000l>", "std::atto" );
+        strrep( function, N, "std::ratio<1l, 1000000000000000l>", "std::femto" );
+        strrep( function, N, "std::ratio<1l, 1000000000000l>", "std::pico" );
+        strrep( function, N, "std::ratio<1l, 1000000000l>", "std::nano" );
+        strrep( function, N, "std::ratio<1l, 1000000l>", "std::micro" );
+        strrep( function, N, "std::ratio<1l, 1000l>", "std::milli" );
+        strrep( function, N, "std::ratio<1l, 100l>", "std::centi" );
+        strrep( function, N, "std::ratio<1l, 10l>", "std::deci" );
+        strrep( function, N, "std::ratio<1l, 1l>", "" );
+        strrep( function, N, "std::ratio<10l, 1l>", "std::deca" );
+        strrep( function, N, "std::ratio<60l, 1l>", "std::ratio<60>" );
+        strrep( function, N, "std::ratio<100l, 1l>", "std::hecto" );
+        strrep( function, N, "std::ratio<1000l, 1l>", "std::kilo" );
+        strrep( function, N, "std::ratio<3600l, 1l>", "std::ratio<3600>" );
+        strrep( function, N, "std::ratio<1000000l, 1l>", "std::mega" );
+        strrep( function, N, "std::ratio<1000000000l, 1l>", "std::giga" );
+        strrep( function, N, "std::ratio<1000000000000l, 1l>", "std::tera" );
+        strrep( function, N, "std::ratio<1000000000000000l, 1l>", "std::peta" );
+        strrep( function, N, "std::ratio<1000000000000000000l, 1l>", "std::exa" );
+        strrep( function, N, "std::ratio<1000000000000000000000l, 1l>", "std::zetta" );
+        strrep( function, N, "std::ratio<1000000000000000000000000l, 1l>", "std::yotta" );
+        strrep( function, N, " >", ">" );
+        strrep( function, N, "< ", "<" );
+    }
+    // Replace std::chrono::duration with abbreviated version
+    if ( find( "std::chrono::duration<" ) != npos ) {
+        // clang-format off
+        strrep( function, N, "std::chrono::duration<long, std::nano>", "std::chrono::nanoseconds" );
+        strrep( function, N, "std::chrono::duration<long, std::micro>", "std::chrono::microseconds" );
+        strrep( function, N, "std::chrono::duration<long, std::milli>", "std::chrono::milliseconds" );
+        strrep( function, N, "std::chrono::duration<long>", "std::chrono::seconds" );
+        strrep( function, N, "std::chrono::duration<long,>", "std::chrono::seconds" );
+        strrep( function, N, "std::chrono::duration<long, std::ratio<60>>", "std::chrono::minutes" );
+        strrep( function, N, "std::chrono::duration<long, std::ratio<3600>>", "std::chrono::hours" );
+        strrep( function, N, " >", ">" );
+        strrep( function, N, "< ", "<" );
+        // clang-format on
+    }
+    // Replace std::this_thread::sleep_for with abbreviated version.
+    if ( find( "::sleep_for<" ) != npos ) {
+        strrep( function, N, "::sleep_for<long, std::nano>", "::sleep_for<nanoseconds>" );
+        strrep( function, N, "::sleep_for<long, std::micro>", "::sleep_for<microseconds>" );
+        strrep( function, N, "::sleep_for<long, std::milli>", "::sleep_for<milliseconds>" );
+        strrep( function, N, "::sleep_for<long>", "::sleep_for<seconds>" );
+        strrep( function, N, "::sleep_for<long,>", "::sleep_for<seconds>" );
+        strrep( function, N, "::sleep_for<long, std::ratio<60>>", "::sleep_for<minutes>" );
+        strrep( function, N, "::sleep_for<long, std::ratio<3600>>", "::sleep_for<hours>" );
+        strrep( function, N, "::sleep_for<nanoseconds>(std::chrono::nanoseconds",
+            "::sleep_for(std::chrono::nanoseconds" );
+        strrep( function, N, "::sleep_for<microseconds>(std::chrono::microseconds",
+            "::sleep_for(std::chrono::microseconds" );
+        strrep( function, N, "::sleep_for<milliseconds>(std::chrono::milliseconds",
+            "::sleep_for(std::chrono::milliseconds" );
+        strrep( function, N, "::sleep_for<seconds>(std::chrono::seconds",
+            "::sleep_for(std::chrono::seconds" );
+        // The minutes / hours forms are produced by the rules above as
+        // sleep_for<minutes> / sleep_for<hours>, so match those spellings
+        strrep( function, N, "::sleep_for<minutes>(std::chrono::minutes",
+            "::sleep_for(std::chrono::minutes" );
+        strrep( function, N, "::sleep_for<hours>(std::chrono::hours",
+            "::sleep_for(std::chrono::hours" );
+    }
+    // Replace std::basic_string with abbreviated version
+    strrep( function, N, "std::__cxx11::basic_string<", "std::basic_string<" );
+    size_t pos = 0;
+    while ( pos < N ) {
+        // Find next instance of std::basic_string
+        pos = find( "std::basic_string<", pos );
+        if ( pos == npos )
+            break;
+        // Find the matching > (pos1 is the index of the opening <)
+        size_t pos1 = pos + 17;
+        size_t pos2 = findMatching( function, N, pos1 );
+        if ( pos2 == pos1 )
+            break;
+        if ( strncmp( &function[pos1 + 1], "char", 4 ) == 0 )
+            N = replace( function, N, pos, pos2 - pos, "std::string" );
+        else if ( strncmp( &function[pos1 + 1], "wchar_t", 7 ) == 0 )
+            N = replace( function, N, pos, pos2 - pos, "std::wstring" );
+        else if ( strncmp( &function[pos1 + 1], "char16_t", 8 ) == 0 )
+            N = replace( function, N, pos, pos2 - pos, "std::u16string" );
+        else if ( strncmp( &function[pos1 + 1], "char32_t", 8 ) == 0 )
+            N = replace( function, N, pos, pos2 - pos, "std::u32string" );
+        pos++;
+    }
+    // Replace std::make_shared with abbreviated version: drop the template arguments
+    // after the first one (everything from the first ',' up to the '(')
+    size_t shared = find( "std::make_shared<" );
+    if ( shared != npos ) {
+        size_t pos2 = find( ",", shared );
+        size_t pos3 = find( "(", shared );
+        // Only rewrite when both separators exist and the ',' comes before the '('
+        if ( pos2 != npos && pos3 != npos && pos2 < pos3 )
+            N = replace( function, N, pos2, pos3 - pos2, ">" );
+    }
+    // Remove std::allocator in std::vector
+    size_t vec = find( "std::vector<" );
+    if ( vec != npos ) {
+        size_t pos2 = find( ", std::allocator", vec );
+        size_t pos3 = findMatching( function, N, vec + 11 );
+        // Only rewrite when the allocator argument lies inside the vector's brackets
+        if ( pos2 != npos && pos2 < pos3 )
+            N = replace( function, N, pos2, pos3 - pos2, ">" );
+    }
+}
+// Remove uninteresting entries (StackTrace internals, libc / libstdc++ / pthread / MPI /
+// MATLAB internals, standard library wrappers) from the children of `stack`, apply the
+// same cleanup recursively, and merge children that end up with the same stack entry.
+// An entry selected for removal is erased if it has no children and replaced by its
+// child if it has exactly one; with several children it is kept.
+void StackTrace::cleanupStackTrace( multi_stack_info &stack )
+{
+    auto it           = stack.children.begin();
+    const size_t npos = std::string::npos;
+    while ( it != stack.children.end() ) {
+        string_view object( it->stack.object.data() );
+        string_view function( it->stack.function.data() );
+        string_view filename( it->stack.filename.data() );
+        bool remove_entry = false;
+        // Remove StackTrace functions
+        if ( filename == "StackTrace.cpp" ) {
+            // Remove callstack (and all children) for threads that are just contributing
+            bool test = function.find( "_callstack_signal_handler" ) != npos ||
+                        function.find( "getGlobalCallStacks" ) != npos ||
+                        function.find( "(" ) == npos;
+            if ( test ) {
+                it = stack.children.erase( it );
+                continue;
+            }
+            // Remove backtrace_thread
+            if ( function.find( "backtrace_thread" ) != npos )
+                remove_entry = true;
+        }
+        // Remove libc functions
+        if ( object.find( "libc.so" ) != npos ) {
+            // Remove __libc_start_main
+            if ( function.find( "__libc_start_main" ) != npos )
+                remove_entry = true;
+            // Remove libc fgets children
+            if ( function.find( "fgets" ) != npos )
+                it->children.clear();
+        }
+        // Remove libstdc++ functions
+        if ( object.find( "libstdc++" ) != npos ) {
+            // Remove std::this_thread::__sleep_for
+            if ( function.find( "std::this_thread::__sleep_for(" ) != npos )
+                remove_entry = true;
+        }
+        // Remove pthread functions
+        if ( object.find( "libpthread" ) != npos ) {
+            // Remove __restore_rt
+            if ( function.find( "__restore_rt" ) != npos )
+                remove_entry = true;
+        }
+        // Remove condition_variable functions
+        if ( filename == "condition_variable" ) {
+            // Remove std::condition_variable::__wait_until_impl
+            if ( function.find( "std::condition_variable::__wait_until_impl" ) != npos )
+                remove_entry = true;
+        }
+        // Remove std::function references
+        if ( filename == "functional" ) {
+            remove_entry = remove_entry || function.find( "std::_Function_handler<" ) != npos;
+            remove_entry = remove_entry || function.find( "std::_Bind_simple<" ) != npos;
+            remove_entry = remove_entry || function.find( "_M_invoke" ) != npos;
+        }
+        // Remove std::thread::_Impl
+        if ( filename == "thread" ) {
+            if ( function.find( "std::thread::_Impl<" ) != npos ||
+                 function.find( "std::thread::_Invoker<" ) != npos )
+                remove_entry = true;
+        }
+        if ( filename == "invoke.h" ) {
+            remove_entry = remove_entry || function.find( "std::__invoke_impl" ) != npos;
+            remove_entry = remove_entry || function.find( "std::__invoke_result" ) != npos;
+        }
+        // Remove pthread internals
+        if ( function == "__GI___pthread_timedjoin_ex" )
+            remove_entry = true;
+        // Remove MPI internal routines
+        if ( function == "MPIR_Barrier_impl" || function == "MPIR_Barrier_intra" ||
+             function == "MPIC_Sendrecv" )
+            remove_entry = true;
+        // Remove OpenMPI specific internal routines
+        if ( function == "opal_libevent2022_event_set_log_callback" ||
+             function == "opal_libevent2022_event_base_loop" )
+            remove_entry = true;
+        // Remove MATLAB internal routines
+        if ( object == "libmwmcr.so" || object == "libmwm_lxe.so" || object == "libmwbridge.so" ||
+             object == "libmwiqm.so" )
+            remove_entry = true;
+        // Remove std::shared_ptr functions
+        if ( filename == "shared_ptr.h" ) {
+            if ( function.find( "> std::allocate_shared<" ) != npos ||
+                 function.find( "std::_Sp_make_shared_tag," ) != npos )
+                remove_entry = true;
+        }
+        if ( filename == "shared_ptr_base.h" )
+            remove_entry = true;
+        // Remove new_allocator functions
+        if ( filename == "new_allocator.h" )
+            remove_entry = true;
+        // Remove alloc_traits functions
+        if ( filename == "alloc_traits.h" )
+            remove_entry = true;
+        // Remove gthr-default functions
+        if ( filename == "gthr-default.h" )
+            remove_entry = true;
+        // Remove entries with no useful information
+        if ( function.empty() && filename.empty() )
+            remove_entry = true;
+        // Remove the desired entry
+        if ( remove_entry ) {
+            if ( it->children.empty() ) {
+                it = stack.children.erase( it );
+                continue;
+            } else if ( it->children.size() == 1 ) {
+                // Copy the child first: assigning it->children[0] directly would read
+                // from an element of the object that the assignment is overwriting
+                auto child = it->children[0];
+                *it        = child;
+                continue;
+            }
+        }
+        // Cleanup the children
+        cleanupStackTrace( *it );
+        // Combine any children with the same address (can occur when we remove items)
+        bool remove = false;
+        for ( auto it2 = stack.children.begin(); it2 != it; it2++ ) {
+            if ( it->stack == it2->stack ) {
+                remove = true;
+                it2->N += it->N;
+                for ( auto &tmp : it->children )
+                    it2->children.push_back( tmp );
+                cleanupStackTrace( *it2 );
+            }
+        }
+        if ( remove ) {
+            it = stack.children.erase( it );
+            continue;
+        }
+        ++it;
+    }
+}
+
+
+/****************************************************************************
+ *  Generate stack from string                                               *
+ ****************************************************************************/
+static StackTrace::stack_info parseLine( const char *str )
+{
+    StackTrace::stack_info stack;
+    // Load the address; a line without a well-formed "x<hex>:" field yields the default entry
+    const char *p0 = strchr( str, 0 );
+    const char *p1 = strchr( str, 'x' );
+    const char *p2 = strchr( str, ':' );
+    if ( p1 == nullptr || p2 == nullptr || p2 < p1 )
+        return stack;
+    // strtoull parses in place (it stops at the ':') and keeps addresses with the top bit set
+    uint64_t address = strtoull( p1 + 1, nullptr, 16 );
+    stack.address    = reinterpret_cast<void *>( address );
+    stack.address2   = stack.address;
+    // Load object, function, file (fields are separated by runs of two or more spaces)
+    const char *p3 = p2 + 1;
+    while ( *p3 == ' ' )
+        p3++;
+    if ( *p3 == 0 )
+        return stack;
+    const char *p4 = strstr( p3, "  " );
+    const char *p5 = nullptr;
+    if ( p4 != nullptr ) {
+        while ( *p4 == ' ' )
+            p4++;
+        p5 = strstr( p4, "  " );
+        if ( p5 != nullptr ) {
+            while ( *p5 == ' ' )
+                p5++;
+        }
+    }
+    if ( p5 == nullptr ) {
+        if ( p3 - p2 > 20 ) { // first field starts far from the ':': treat the object as empty
+            p5 = p4;
+            p4 = p3;
+        }
+    }
+    if ( p4 == nullptr )
+        p4 = p0;
+    if ( p5 == nullptr )
+        p5 = p0;
+    // Load line
+    const char *p6 = strchr( p5, ':' );
+    if ( p6 == nullptr )
+        p6 = p0;
+    // Store the results (copy at most size()-1 chars so every field stays null-terminated)
+    auto copyField = []( const char *p1, const char *p2, auto &field ) {
+        field.fill( 0 );
+        memcpy( field.data(), p1, std::min<int>( p2 - p1, field.size() - 1 ) );
+        for ( int i = field.size() - 1; i > 0 && ( field[i] == ' ' || field[i] == 0 ); i-- )
+            field[i] = 0;
+    };
+    copyField( p3, p4, stack.object );
+    copyField( p4, p5, stack.function );
+    copyField( p5, p6, stack.filename );
+    if ( p6 != p0 )
+        stack.line = atoi( p6 + 1 );
+    return stack;
+}
+StackTrace::multi_stack_info StackTrace::generateFromString( const std::string &str )
+{
+    // Split the text at every '\n'; the newline characters themselves are dropped
+    std::vector<std::string> lines;
+    size_t start = 0;
+    for ( size_t stop = str.find( '\n' ); stop != std::string::npos;
+          stop        = str.find( '\n', start ) ) {
+        lines.push_back( str.substr( start, stop - start ) );
+        start = stop + 1;
+    }
+    // Whatever follows the last newline (possibly an empty string) is the final line
+    lines.push_back( str.substr( start ) );
+    // Hand the lines to the vector overload to generate the stack
+    return generateFromString( lines );
+}
+StackTrace::multi_stack_info StackTrace::generateFromString( const std::vector<std::string> &text )
+{
+    // Get the data from the text (lines without an 'x', i.e. no "0x" address, are skipped)
+    std::vector<int> indent;
+    std::vector<multi_stack_info> stack;
+    for ( const auto &str : text ) {
+        auto p1 = str.find( '[' );
+        auto p2 = str.find( ']' );
+        auto p3 = str.find( 'x' );
+        // A leading 'x' leaves no room for the '0' of "0x", and p3 - 1 would wrap around
+        if ( p3 == std::string::npos || p3 == 0 )
+            continue;
+        multi_stack_info tmp;
+        tmp.N = ( p1 < p2 && p1 < p3 ) ? std::stoi( str.substr( p1 + 1, p2 - p1 - 1 ) ) : 1;
+        tmp.stack = parseLine( &str[p3 - 1] );
+        indent.push_back( std::min( p1, p3 - 1 ) );
+        stack.push_back( tmp );
+    }
+    // Generate the stack hierarchy; the root level is the smallest indent so prefixed text works
+    multi_stack_info stack2;
+    std::vector<std::pair<int, std::vector<multi_stack_info> *>> map;
+    int base = indent.empty() ? 0 : *std::min_element( indent.begin(), indent.end() );
+    map.emplace_back( base, &stack2.children );
+    for ( size_t i = 0; i < stack.size(); i++ ) {
+        while ( indent[i] < map.back().first )
+            map.pop_back();
+        if ( indent[i] == map.back().first || map.back().second->empty() ) {
+            map.back().second->push_back( stack[i] ); // sibling (or nothing yet to nest under)
+        } else {
+            map.back().second->back().children.push_back( stack[i] );
+            map.emplace_back( indent[i], &map.back().second->back().children );
+        }
+    }
+    return stack2;
+}
+
+
+/****************************************************************************
+ *  abort_error                                                              *
+ ****************************************************************************/
+StackTrace::abort_error::abort_error() // stackType needs a defined value: what() branches on it
+    : type( terminateType::unknown ), stackType( printStackType::local ), signal( 0 ), line( -1 ),
+      bytes( 0 )
+{}
+const char *StackTrace::abort_error::what() const noexcept // rebuilds d_msg on every call
+{
+    d_msg.clear();
+    if ( type == terminateType::abort ) {
+        d_msg += "Program abort called";
+    } else if ( type == terminateType::signal ) {
+        d_msg += "Unhandled signal (" + std::to_string( signal ) + ") caught";
+    } else if ( type == terminateType::exception ) {
+        d_msg += "Unhandled exception caught";
+    } else if ( type == terminateType::MPI ) {
+        d_msg += "Error calling MPI routine";
+    } else {
+        d_msg += "Unknown error called";
+    }
+    if ( !filename.empty() ) {
+        d_msg += " in file '" + filename + "'";
+        if ( line > 0 ) {
+            d_msg += " at line " + std::to_string( line );
+        }
+    }
+    d_msg += ":\n";
+    d_msg += "   " + message + "\n";
+    if ( bytes > 0 ) {
+        d_msg += "Bytes used = " + std::to_string( bytes ) + "\n";
+    }
+    if ( !stack.empty() ) {
+        d_msg += "Stack Trace:\n";
+        if ( stackType == printStackType::local ) {
+            for ( const auto &item : getStackInfo( stack ) ) {
+                char txt[1000];
+                item.print2( txt );
+                d_msg += " \n";
+                d_msg += txt;
+            }
+        } else if ( stackType == printStackType::threaded || stackType == printStackType::global ) {
+            // Get the call stack
+            std::vector<std::vector<void *>> trace;
+            trace.push_back( stack );
+            // Get the call stack for all threads except the current one
+            auto threads = getActiveThreads();
+            threads.erase( thisThread() );
+            for ( auto tid : threads )
+                trace.push_back( backtrace( tid ) );
+            // Generate call stack
+            auto multistack = generateMultiStack( trace );
+            // Add remote call stack info
+            if ( stackType == printStackType::global )
+                multistack.add( getRemoteCallStacks() );
+            // Cleanup call stack
+            cleanupStackTrace( multistack );
+            // Print the results
+            d_msg += multistack.printString( " " );
+        } else {
+            d_msg += "Unknown value for stackType\n";
+        }
+    }
+    for ( size_t i = d_msg.size(); i > 0; i-- ) // backwards: an erase never skips the next null
+        if ( d_msg[i - 1] == 0 )
+            d_msg.erase( i - 1, 1 );
+    return d_msg.c_str();
+}
diff --git a/common/StackTrace.h b/StackTrace/StackTrace.h
similarity index 59%
rename from common/StackTrace.h
rename to StackTrace/StackTrace.h
index 8d436bf7..ce315020 100644
--- a/common/StackTrace.h
+++ b/StackTrace/StackTrace.h
@@ -1,41 +1,30 @@
 #ifndef included_StackTrace
 #define included_StackTrace
 
+#include <array>
 #include <functional>
 #include <iostream>
 #include <set>
 #include <thread>
 #include <vector>
 
-
-// Check for and include MPI
-// clang-format off
-#if defined(USE_MPI) || defined(USE_EXT_MPI)
-    #include "mpi.h"
-#elif defined(__has_include)
-    #if __has_include("mpi.h")
-        #include "mpi.h"
-    #else
-        typedef int MPI_Comm;
-    #endif
-#else
-    typedef int MPI_Comm;
-#endif
-// clang-format on
+#include "StackTrace/string_view.h"
 
 
 namespace StackTrace {
 
-
+//! Class to contain stack trace info for a single thread/process
 struct stack_info {
+    uint32_t line;
     void *address;
     void *address2;
-    std::string object;
-    std::string function;
-    std::string filename;
-    int line;
+    std::array<char, 56> object;
+    std::array<char, 48> objectPath;
+    std::array<char, 64> filename;
+    std::array<char, 64> filenamePath;
+    std::array<char, 256> function;
     //! Default constructor
-    stack_info() : address( nullptr ), address2( nullptr ), line( 0 ) {}
+    stack_info();
     //! Reset the stack
     void clear();
     //! Operator==
@@ -46,19 +35,22 @@ struct stack_info {
     int getAddressWidth() const;
     //! Print the stack info
     std::string print( int widthAddress = 16, int widthObject = 20, int widthFunction = 32 ) const;
+    //! Print the stack info
+    static void print( std::ostream &out, const std::vector<stack_info> &stack,
+        const StackTrace::string_view &prefix = "" );
+    //! Print the stack info
+    void print2(
+        char *txt, int widthAddress = 16, int widthObject = 20, int widthFunction = 32 ) const;
     //! Compute the number of bytes needed to store the object
     size_t size() const;
     //! Pack the data to a byte array, returning a pointer to the end of the data
     char *pack( char *ptr ) const;
     //! Unpack the data from a byte array, returning a pointer to the end of the data
     const char *unpack( const char *ptr );
-    //! Pack a vector of data to a memory block
-    static std::vector<char> packArray( const std::vector<stack_info> &data );
-    //! Unpack a vector of data from a memory block
-    static std::vector<stack_info> unpackArray( const char *data );
 };
 
 
+//! Class to contain stack trace info for multiple threads/processes
 struct multi_stack_info {
     int N;                                  // Number of threads/processes
     stack_info stack;                       // Current stack item
@@ -71,19 +63,69 @@ struct multi_stack_info {
     multi_stack_info &operator=( const std::vector<stack_info> & );
     //! Reset the stack
     void clear();
+    //! Is the stack empty
+    bool empty() const { return N == 0; }
     //! Add the given stack to the multistack
     void add( size_t len, const stack_info *stack );
+    //! Add the given stack to the multistack
+    void add( const multi_stack_info &stack );
+    //! Compute the number of bytes needed to store the object
+    size_t size() const;
+    //! Pack the data to a byte array, returning a pointer to the end of the data
+    char *pack( char *ptr ) const;
+    //! Unpack the data from a byte array, returning a pointer to the end of the data
+    const char *unpack( const char *ptr );
     //! Print the stack info
-    std::vector<std::string> print( const std::string &prefix = std::string() ) const;
+    std::vector<std::string> print( const StackTrace::string_view &prefix = "" ) const;
+    //! Print the stack info
+    void print( std::ostream &out, const StackTrace::string_view &prefix = "" ) const;
+    //! Print the stack info
+    std::string printString( const StackTrace::string_view &prefix = "" ) const;
 
 private:
-    void print2( const std::string &prefix, int w[3], std::vector<std::string> &text ) const;
+    template<class FUN>
+    void print2( int Np, char *prefix, int w[3], bool c, FUN &fun ) const;
     int getAddressWidth() const;
     int getObjectWidth() const;
     int getFunctionWidth() const;
 };
 
 
+//! Terminate type
+enum class terminateType : uint8_t { signal, exception, abort, MPI, unknown };
+enum class printStackType : uint8_t { local = 1, threaded = 2, global = 3 };
+
+//! Class to contain exception info from abort
+class abort_error : public std::exception
+{
+public:
+    std::string message;       //!< Abort message
+    std::string filename;      //!< File where abort was called
+    terminateType type;        //!< What caused the termination
+    printStackType stackType;  //!< Print the local stack, all threads, or global call stack
+    uint8_t signal;            //!< Signal number
+    int line;                  //!< Line number where abort was called
+    size_t bytes;              //!< Memory in use during abort
+    std::vector<void *> stack; //!< Local call stack for abort
+public:
+    virtual const char *what() const noexcept override;
+    abort_error();
+    virtual ~abort_error() {}
+
+private:
+    mutable std::string d_msg;
+};
+
+
+//! Class to contain symbol information
+struct symbols_struct {
+    char type;
+    void *address;
+    std::array<char, 56> obj;
+    std::array<char, 56> objPath;
+};
+
+
 /*!
  * @brief  Get the current call stack
  * @details  This function returns the current call stack for the current thread
@@ -152,16 +194,18 @@ std::vector<stack_info> getStackInfo( const std::vector<void *> &address );
 
 
 //! Function to return the signal name
-std::string signalName( int signal );
+const char *signalName( int signal );
 
 
 /*!
  * Return the symbols from the current executable (not availible for all platforms)
- * @return      Returns 0 if sucessful
+ * @return      Returns the symbols loaded
  */
-int getSymbols( std::vector<void *> &address,
-                std::vector<char> &type,
-                std::vector<std::string> &obj );
+std::vector<symbols_struct> getSymbols();
+
+
+//! Clear internal symbol data
+void clearSymbols();
 
 
 /*!
@@ -178,20 +222,10 @@ std::string getExecutable();
 std::string getSymPaths();
 
 
-//!< Terminate type
-enum class terminateType { signal, exception };
-
-/*!
- * Set the error handlers
- * @param[in] abort     Function to terminate the program: abort(msg,type)
- */
-void setErrorHandlers( std::function<void( std::string, terminateType )> abort );
-
-
 /*!
  * Set the given signals to the handler
  * @param[in] signals   Signals to handle
- * @param[in] handler   Function to terminate the program: abort(msg,type)
+ * @param[in] handler   Function to terminate the program: abort(signal)
  */
 void setSignals( const std::vector<int> &signals, void ( *handler )( int ) );
 
@@ -200,10 +234,18 @@ void setSignals( const std::vector<int> &signals, void ( *handler )( int ) );
 void clearSignal( int signal );
 
 
+//! Clear the given signals set by setSignals
+void clearSignals( const std::vector<int> &signals );
+
+
 //! Clear all signals set by setSignals
 void clearSignals();
 
 
+//! Raise a signal
+void raiseSignal( int signal );
+
+
 //! Return a list of all signals that can be caught
 std::vector<int> allSignalsToCatch();
 
@@ -212,19 +254,12 @@ std::vector<int> defaultSignalsToCatch();
 
 
 //! Get a list of the active threads
-std::set<std::thread::native_handle_type> activeThreads();
+std::vector<std::thread::native_handle_type> activeThreads();
 
 //! Get a handle to this thread
 std::thread::native_handle_type thisThread();
 
 
-//! Initialize globalCallStack functionallity
-void globalCallStackInitialize( MPI_Comm comm );
-
-//! Clean up globalCallStack functionallity
-void globalCallStackFinalize();
-
-
 /*!
  * @brief  Call system command
  * @details  This function calls a system command, waits for the program
@@ -233,7 +268,25 @@ void globalCallStackFinalize();
  * @param[out] exit_code    Exit code returned from child process
  * @return                  Returns string containing the output
  */
-std::string exec( const std::string &cmd, int &exit_code );
+std::string exec( const string_view &cmd, int &exit_code );
+
+
+/*!
+ * @brief  Create stack from string
+ * @details  This function creates the call stack from the string generated by print
+ * @param[in] str           Vector of strings containing call stack
+ * @return                  Returns the call stack
+ */
+multi_stack_info generateFromString( const std::vector<std::string> &str );
+
+
+/*!
+ * @brief  Create stack from string
+ * @details  This function creates the call stack from the string
+ * @param[in] str           String containing call stack
+ * @return                  Returns the call stack
+ */
+multi_stack_info generateFromString( const std::string &str );
 
 
 } // namespace StackTrace
diff --git a/StackTrace/Utilities.cpp b/StackTrace/Utilities.cpp
new file mode 100644
index 00000000..734a0056
--- /dev/null
+++ b/StackTrace/Utilities.cpp
@@ -0,0 +1,296 @@
+#define NOMINMAX
+#include "StackTrace/Utilities.h"
+#include "StackTrace/ErrorHandlers.h"
+#include "StackTrace/StackTrace.h"
+
+#include <algorithm>
+#include <csignal>
+#include <cstring>
+#include <fstream>
+#include <iostream>
+#include <sstream>
+#include <stdexcept>
+
+#ifdef USE_MPI
+#include "mpi.h"
+#endif
+
+#ifdef USE_TIMER
+#include "MemoryApp.h"
+#endif
+
+
+#define perr std::cerr
+
+
+// Detect the OS
+// clang-format off
+#if defined( WIN32 ) || defined( _WIN32 ) || defined( WIN64 ) || defined( _WIN64 ) || defined( _MSC_VER )
+    #define USE_WINDOWS
+#elif defined( __APPLE__ )
+    #define USE_MAC
+#elif defined( __linux ) || defined( __linux__ ) || defined( __unix ) || defined( __posix )
+    #define USE_LINUX
+    #define USE_NM
+#else
+    #error Unknown OS
+#endif
+// clang-format on
+
+
+// Include system dependent headers
+// clang-format off
+#ifdef USE_WINDOWS
+    #include <process.h>
+    #include <psapi.h>
+    #include <stdio.h>
+    #include <tchar.h>
+    #include <windows.h>
+#else
+    #include <dlfcn.h>
+    #include <execinfo.h>
+    #include <sched.h>
+    #include <sys/time.h>
+    #include <ctime>
+    #include <unistd.h>
+#endif
+#ifdef USE_LINUX
+    #include <malloc.h>
+#endif
+#ifdef USE_MAC
+    #include <mach/mach.h>
+    #include <sys/sysctl.h>
+    #include <sys/types.h>
+#endif
+// clang-format on
+
+
+namespace StackTrace {
+
+
+/****************************************************************************
+ *  Function to find an entry                                                *
+ ****************************************************************************/
+template<class TYPE>
+inline size_t findfirst( const std::vector<TYPE> &X, TYPE Y )
+{
+    // Bisection search for the index of the first entry that is >= Y.
+    // An empty vector yields 0; if every entry is < Y the last index is returned.
+    // NOTE(review): presumably X is sorted in ascending order - confirm against callers.
+    if ( X.empty() )
+        return 0;
+    if ( X.front() >= Y )
+        return 0;
+    size_t lo = 0, hi = X.size() - 1;
+    if ( X[hi] < Y )
+        return hi;
+    // From here on X[lo] < Y <= X[hi]; shrink the bracket until the two are adjacent
+    while ( hi - lo != 1 ) {
+        const size_t mid = ( hi + lo ) / 2;
+        ( X[mid] >= Y ? hi : lo ) = mid;
+    }
+    return hi;
+}
+
+
+/****************************************************************************
+ *  Function to terminate the program                                        *
+ ****************************************************************************/
+static bool abort_throwException      = false;
+static printStackType abort_stackType = printStackType::global;
+static int force_exit                 = 0;
+void Utilities::setAbortBehavior( bool throwException, int stackType )
+{
+    abort_throwException = throwException;
+    abort_stackType      = static_cast<printStackType>( stackType );
+}
+void Utilities::abort( const std::string &message, const std::string &filename, const int line )
+{
+    abort_error err;
+    err.message   = message;
+    err.filename  = filename;
+    err.type      = terminateType::abort;
+    err.line      = line;
+    err.bytes     = Utilities::getMemoryUsage();
+    err.stackType = abort_stackType;
+    err.stack     = StackTrace::backtrace();
+    throw err;
+}
+static void terminate( const StackTrace::abort_error &err ) // installed via setErrorHandler below
+{
+    clearErrorHandler();
+    // Print the message and abort; every path through this function ends in std::abort/MPI_Abort
+    if ( force_exit > 1 ) {
+        std::abort(); // force_exit == 2 was set by an earlier pass through the branch below
+    } else if ( !abort_throwException ) {
+        // Use MPI_abort (will terminate all processes) when MPI is initialized and not finalized
+        force_exit = 2;
+        perr << err.what();
+#if defined( USE_MPI ) || defined( HAVE_MPI )
+        int initialized = 0, finalized = 0;
+        MPI_Initialized( &initialized );
+        MPI_Finalized( &finalized );
+        if ( initialized != 0 && finalized == 0 ) {
+            clearMPIErrorHandler( MPI_COMM_WORLD );
+            MPI_Abort( MPI_COMM_WORLD, -1 );
+        }
+#endif
+        std::abort();
+    } else {
+        perr << err.what(); // NOTE(review): same print-and-abort as above, minus the MPI_Abort
+        std::abort();
+    }
+}
+
+
+/****************************************************************************
+ *  Functions to set the error handler                                       *
+ ****************************************************************************/
+static void setTerminateErrorHandler()
+{
+    // Set the terminate routine for runtime errors
+    StackTrace::setErrorHandler( terminate );
+}
+void Utilities::setErrorHandlers()
+{
+#ifdef USE_MPI
+    setMPIErrorHandler( MPI_COMM_WORLD );
+    setMPIErrorHandler( MPI_COMM_SELF );
+#endif
+    setTerminateErrorHandler();
+}
+void Utilities::clearErrorHandlers()
+{
+#ifdef USE_MPI
+    clearMPIErrorHandler( MPI_COMM_WORLD );
+    clearMPIErrorHandler( MPI_COMM_SELF );
+#endif
+}
+
+
+/****************************************************************************
+ *  Function to get the memory usage                                         *
+ *  Note: this function should be thread-safe                                *
+ ****************************************************************************/
+// clang-format off
+#if defined( USE_MAC ) || defined( USE_LINUX )
+    // Get the page size on mac or linux
+    static size_t page_size = static_cast<size_t>( sysconf( _SC_PAGESIZE ) );
+#endif
+size_t Utilities::getSystemMemory()
+{
+    #if defined( USE_LINUX )
+        static long pages = sysconf( _SC_PHYS_PAGES );
+        size_t N_bytes    = pages * page_size;
+    #elif defined( USE_MAC )
+        int mib[2]    = { CTL_HW, HW_MEMSIZE };
+        u_int namelen = sizeof( mib ) / sizeof( mib[0] );
+        uint64_t size;
+        size_t len = sizeof( size );
+        size_t N_bytes = 0;
+        if ( sysctl( mib, namelen, &size, &len, nullptr, 0 ) == 0 )
+            N_bytes = size;
+    #elif defined( USE_WINDOWS )
+        MEMORYSTATUSEX status;
+        status.dwLength = sizeof( status );
+        GlobalMemoryStatusEx( &status );
+        size_t N_bytes = status.ullTotalPhys;
+    #else
+        #error Unknown OS
+    #endif
+    return N_bytes;
+}
+size_t Utilities::getMemoryUsage()
+{
+    #ifdef USE_TIMER
+        size_t N_bytes = MemoryApp::getTotalMemoryUsage();
+    #else
+        #if defined( USE_LINUX )
+            struct mallinfo meminfo = mallinfo();
+            size_t size_hblkhd      = static_cast<unsigned int>( meminfo.hblkhd );
+            size_t size_uordblks    = static_cast<unsigned int>( meminfo.uordblks );
+            size_t N_bytes          = size_hblkhd + size_uordblks;
+        #elif defined( USE_MAC )
+            struct task_basic_info t_info;
+            mach_msg_type_number_t t_info_count = TASK_BASIC_INFO_COUNT;
+            if ( KERN_SUCCESS !=
+                 task_info( mach_task_self(), TASK_BASIC_INFO, (task_info_t) &t_info, &t_info_count ) ) {
+                return 0;
+            }
+            size_t N_bytes = t_info.virtual_size;
+        #elif defined( USE_WINDOWS )
+            PROCESS_MEMORY_COUNTERS memCounter;
+            GetProcessMemoryInfo( GetCurrentProcess(), &memCounter, sizeof( memCounter ) );
+            size_t N_bytes = memCounter.WorkingSetSize;
+        #else
+            #error Unknown OS
+        #endif
+    #endif
+    return N_bytes;
+}
+// clang-format on
+
+
+/****************************************************************************
+ *  Functions to get the time and timer resolution                           *
+ ****************************************************************************/
+#if defined( USE_WINDOWS )
+double Utilities::time()
+{
+    LARGE_INTEGER end, f;
+    QueryPerformanceFrequency( &f );
+    QueryPerformanceCounter( &end );
+    double time = ( (double) end.QuadPart ) / ( (double) f.QuadPart );
+    return time;
+}
+double Utilities::tick()
+{
+    LARGE_INTEGER f;
+    QueryPerformanceFrequency( &f );
+    double resolution = ( (double) 1.0 ) / ( (double) f.QuadPart );
+    return resolution;
+}
+#elif defined( USE_LINUX ) || defined( USE_MAC )
+double Utilities::time()
+{
+    timeval current_time;
+    gettimeofday( &current_time, nullptr );
+    double time = ( (double) current_time.tv_sec ) + 1e-6 * ( (double) current_time.tv_usec );
+    return time;
+}
+double Utilities::tick()
+{
+    timeval start, end;
+    gettimeofday( &start, nullptr );
+    gettimeofday( &end, nullptr );
+    while ( end.tv_sec == start.tv_sec && end.tv_usec == start.tv_usec )
+        gettimeofday( &end, nullptr );
+    double resolution = ( (double) ( end.tv_sec - start.tv_sec ) ) +
+                        1e-6 * ( (double) ( end.tv_usec - start.tv_usec ) );
+    return resolution;
+}
+#else
+#error Unknown OS
+#endif
+
+
+/****************************************************************************
+ *  Cause a segfault                                                         *
+ ****************************************************************************/
+void Utilities::cause_segfault()
+{
+    int *ptr = nullptr;
+    ptr[0]   = 0;
+}
+
+
+/****************************************************************************
+ *  Call system command                                                      *
+ ****************************************************************************/
+std::string Utilities::exec( const string_view &cmd, int &exit_code )
+{
+    return StackTrace::exec( cmd, exit_code );
+}
+
+
+} // namespace StackTrace
diff --git a/StackTrace/Utilities.h b/StackTrace/Utilities.h
new file mode 100644
index 00000000..10ed9085
--- /dev/null
+++ b/StackTrace/Utilities.h
@@ -0,0 +1,99 @@
+#ifndef included_StackTrace_Utilities
+#define included_StackTrace_Utilities
+
+#include <stdexcept>
+#include <string>
+#include <thread>
+
+#include "StackTrace/StackTrace.h"
+#include "StackTrace/string_view.h"
+
+
+namespace StackTrace {
+namespace Utilities {
+
+
+/*!
+ * Aborts the run after printing an error message with file and
+ * line number information.
+ */
+void abort( const std::string &message, const std::string &filename, const int line );
+
+
+/*!
+ * Set the behavior of abort
+ * @param throwException    Throw an exception instead of MPI_Abort (default is false)
+ * @param stackType         Type of stack to get (1: thread local stack, 2: all threads, 3: global)
+ */
+void setAbortBehavior( bool throwException, int stackType = 2 );
+
+
+//! Function to set the error handlers
+void setErrorHandlers();
+
+//! Function to clear the error handlers
+void clearErrorHandlers();
+
+
+/*!
+ * Function to get the memory available.
+ * This function will return the total memory available.
+ * Note: depending on the implementation, this number may be rounded
+ * to a multiple of the page size.
+ * If this function fails, it will return 0.
+ */
+size_t getSystemMemory();
+
+
+/*!
+ * Function to get the memory usage.
+ * This function will return the total memory used by the application.
+ * Note: depending on the implementation, this number may be rounded
+ * to a multiple of the page size.
+ * If this function fails, it will return 0.
+ */
+size_t getMemoryUsage();
+
+
+//! Function to get an arbitrary point in time
+double time();
+
+
+//! Function to get the resolution of time
+double tick();
+
+
+/*!
+ * Sleep for X ms
+ * @param N         Time to sleep (ms)
+ */
+inline void sleep_ms( int N ) { std::this_thread::sleep_for( std::chrono::milliseconds( N ) ); }
+
+
+/*!
+ * Sleep for X s
+ * @param N         Time to sleep (s)
+ */
+inline void sleep_s( int N ) { std::this_thread::sleep_for( std::chrono::seconds( N ) ); }
+
+
+//! Cause a segfault
+void cause_segfault();
+
+
+/*!
+ * @brief  Call system command
+ * @details  This function calls a system command, waits for the program
+ *   to execute, captures and returns the output and exit code.
+ * @param[in] cmd           Command to execute
+ * @param[out] exit_code    Exit code returned from child process
+ * @return                  Returns string containing the output
+ */
+std::string exec( const StackTrace::string_view &cmd, int &exit_code );
+
+
+} // namespace Utilities
+} // namespace StackTrace
+
+
+#endif
diff --git a/StackTrace/string_view.h b/StackTrace/string_view.h
new file mode 100644
index 00000000..d83d1f24
--- /dev/null
+++ b/StackTrace/string_view.h
@@ -0,0 +1,193 @@
+#ifndef included_StackTrace_stringView
+#define included_StackTrace_stringView
+
+#include <cstring>
+#include <ostream>
+
+namespace StackTrace {
+
+// Minimal non-owning view of a character range (pointer + length), modeled on std::string_view
+class string_view
+{
+public:
+    // Constants:
+    static constexpr size_t npos = size_t( -1 );
+
+    // Constructions
+    constexpr string_view() noexcept : d_data( nullptr ), d_size( 0 ) {}
+    constexpr string_view( string_view&& ) noexcept      = default;
+    constexpr string_view( const string_view& ) noexcept = default;
+    constexpr string_view( const char* s ) : d_data( s ), d_size( s ? strlen( s ) : 0 ) {}
+    constexpr string_view( const char* s, size_t count ) : d_data( s ), d_size( count ) {}
+    inline string_view( const std::string& s ) : d_data( s.data() ), d_size( s.size() ) {}
+
+    // Assignment
+    constexpr string_view& operator=( string_view&& other ) noexcept = default;
+    constexpr string_view& operator=( const string_view& other ) noexcept = default;
+
+    // Iterators
+    constexpr const char* begin() const noexcept { return d_data; }
+    constexpr const char* end() const noexcept { return d_data + d_size; }
+    constexpr const char* cbegin() const noexcept { return begin(); }
+    constexpr const char* cend() const noexcept { return end(); }
+
+    // capacity
+    constexpr size_t size() const noexcept { return d_size; }
+    constexpr size_t length() const noexcept { return d_size; }
+    constexpr bool empty() const noexcept { return d_size == 0; }
+
+    // Element access
+    constexpr const char& operator[]( size_t pos ) const
+    {
+        if ( pos >= d_size )
+            throw std::out_of_range( "string_view[]" );
+        return d_data[pos];
+    }
+    constexpr const char& at( size_t pos ) const
+    {
+        if ( pos >= d_size )
+            throw std::out_of_range( "string_view::at()" );
+        return d_data[pos];
+    }
+    constexpr const char& front() const
+    {
+        if ( d_size == 0 )
+            throw std::out_of_range( "front()" );
+        return d_data[0];
+    }
+    constexpr const char& back() const
+    {
+        if ( d_size == 0 )
+            throw std::out_of_range( "back()" );
+        return d_data[size() - 1];
+    }
+    constexpr const char* data() const noexcept { return d_data; }
+
+    // Swap data
+    void swap( string_view& other ) noexcept
+    {
+        std::swap( d_data, other.d_data );
+        std::swap( d_size, other.d_size );
+    }
+
+    // String operations
+    size_t copy( char* dest, size_t n, size_t pos = 0 ) const
+    {
+        if ( pos > size() )
+            throw std::out_of_range( "string_view::copy()" );
+        const size_t rlen = std::min( n, size() - pos );
+        memcpy( dest, data() + pos, rlen );
+        return rlen;
+    }
+    constexpr string_view substr( size_t pos = 0, size_t n = npos ) const
+    {
+        if ( pos > size() )
+            throw std::out_of_range( "string_view::substr()" );
+        return string_view( data() + pos, std::min( n, size() - pos ) );
+    }
+
+    // Find
+    constexpr size_t find( char ch, size_t pos = 0 ) const noexcept
+    {
+        for ( size_t i = pos; i < d_size; i++ )
+            if ( d_data[i] == ch )
+                return i;
+        return std::string::npos;
+    }
+    constexpr size_t find( string_view v, size_t pos = 0 ) const noexcept
+    {
+        size_t i = pos;
+        size_t N = v.size();
+        if ( N == 0 || pos > d_size || N > ( d_size - pos ) )
+            return std::string::npos;
+        while ( i < ( d_size - N + 1 ) ) {
+            size_t j = 0;
+            for ( j = 0; j < N && i + j < d_size; j++ )
+                if ( d_data[i + j] != v[j] )
+                    break;
+            if ( j == N )
+                return i;
+            i++;
+        }
+        return std::string::npos;
+    }
+
+    // compare()
+    constexpr int compare( const string_view& other ) const noexcept
+    {
+        // The first differing character decides the sign. The result must be nonzero even when
+        // that character is at index 0 (returning +/-index there made "abc" equal to "xbc").
+        const size_t N = std::min( size(), other.size() );
+        for ( size_t i = 0; i < N; i++ )
+            if ( d_data[i] != other.d_data[i] )
+                return d_data[i] < other.d_data[i] ? -1 : 1;
+        // Common prefix is identical: the shorter view orders first
+        return size() == other.size() ? 0 : size() < other.size() ? -1 : 1;
+    }
+    constexpr int compare( size_t pos1, size_t n1, string_view other ) const
+    {
+        return substr( pos1, n1 ).compare( other );
+    }
+    constexpr int compare( size_t pos1, size_t n1, string_view other, size_t pos2, size_t n2 ) const
+    {
+        return substr( pos1, n1 ).compare( other.substr( pos2, n2 ) );
+    }
+    constexpr int compare( char const* s ) const { return compare( string_view( s ) ); }
+    constexpr int compare( size_t pos1, size_t n1, char const* s ) const
+    {
+        return substr( pos1, n1 ).compare( string_view( s ) );
+    }
+    constexpr int compare( size_t pos1, size_t n1, char const* s, size_t n2 ) const
+    {
+        return substr( pos1, n1 ).compare( string_view( s, n2 ) );
+    }
+
+    explicit operator std::string() const { return std::string( begin(), end() ); }
+    std::string to_string() const { return std::string( begin(), end() ); }
+
+private:
+    const char* d_data;
+    size_t d_size;
+};
+
+
+// Non-member functions:
+constexpr inline bool operator==( const string_view& lhs, const string_view& rhs ) noexcept
+{
+    return lhs.compare( rhs ) == 0;
+}
+constexpr inline bool operator!=( const string_view& lhs, const string_view& rhs ) noexcept
+{
+    return lhs.compare( rhs ) != 0;
+}
+constexpr inline bool operator<( const string_view& lhs, const string_view& rhs ) noexcept
+{
+    return lhs.compare( rhs ) < 0;
+}
+
+constexpr inline bool operator<=( const string_view& lhs, const string_view& rhs ) noexcept
+{
+    return lhs.compare( rhs ) <= 0;
+}
+constexpr inline bool operator>( const string_view& lhs, const string_view& rhs ) noexcept
+{
+    return lhs.compare( rhs ) > 0;
+}
+constexpr inline bool operator>=( const string_view& lhs, const string_view& rhs ) noexcept
+{
+    return lhs.compare( rhs ) >= 0;
+}
+inline std::string to_string( const string_view& v ) { return std::string( v.begin(), v.end() ); }
+inline string_view to_string_view( std::string const& s )
+{
+    return string_view( s.data(), s.size() );
+}
+inline std::ostream& operator<<( std::ostream& out, const string_view& s )
+{
+    // Write exactly size() bytes: the view need not be null-terminated and data() may be null
+    return out.write( s.data(), static_cast<std::streamsize>( s.size() ) );
+}
+
+} // namespace StackTrace
+
+#endif
diff --git a/analysis/Minkowski.cpp b/analysis/Minkowski.cpp
index 650d30dc..743e4751 100644
--- a/analysis/Minkowski.cpp
+++ b/analysis/Minkowski.cpp
@@ -1,10 +1,8 @@
 #include "analysis/Minkowski.h"
 #include "analysis/pmmc.h"
+#include "analysis/analysis.h"
 #include "common/Domain.h"
 #include "common/Communication.h"
-#include "analysis/analysis.h"
-
-#include "shared_ptr.h"
 #include "common/Utilities.h"
 #include "common/MPI_Helpers.h"
 #include "IO/MeshDatabase.h"
@@ -13,6 +11,8 @@
 
 #include "ProfilerApp.h"
 
+#include <memory>
+
 
 #define PI 3.14159265359
 
diff --git a/analysis/Minkowski.h b/analysis/Minkowski.h
index 8c39b68a..472b4489 100644
--- a/analysis/Minkowski.h
+++ b/analysis/Minkowski.h
@@ -2,6 +2,7 @@
 #ifndef Minkowski_INC
 #define Minkowski_INC
 
+#include <memory>
 #include <vector>
 
 #include "analysis/dcel.h"
@@ -9,7 +10,6 @@
 #include "common/Communication.h"
 #include "analysis/analysis.h"
 
-#include "shared_ptr.h"
 #include "common/Utilities.h"
 #include "common/MPI_Helpers.h"
 #include "IO/MeshDatabase.h"
diff --git a/analysis/TwoPhase.cpp b/analysis/TwoPhase.cpp
index cb752f07..a558fea6 100644
--- a/analysis/TwoPhase.cpp
+++ b/analysis/TwoPhase.cpp
@@ -1,17 +1,18 @@
 #include "analysis/TwoPhase.h"
 
 #include "analysis/pmmc.h"
+#include "analysis/analysis.h"
 #include "common/Domain.h"
 #include "common/Communication.h"
-#include "analysis/analysis.h"
-
-#include "shared_ptr.h"
 #include "common/Utilities.h"
 #include "common/MPI_Helpers.h"
 #include "IO/MeshDatabase.h"
 #include "IO/Reader.h"
 #include "IO/Writer.h"
 
+#include <memory>
+
+
 #define BLOB_AVG_COUNT 35
 
 // Array access for averages defined by the following
diff --git a/analysis/TwoPhase.h b/analysis/TwoPhase.h
index 01df349d..fddd04e8 100644
--- a/analysis/TwoPhase.h
+++ b/analysis/TwoPhase.h
@@ -2,16 +2,15 @@
 #ifndef TwoPhase_INC
 #define TwoPhase_INC
 
+#include <memory>
 #include <vector>
 
 #include "analysis/pmmc.h"
-#include "common/Domain.h"
-#include "common/Communication.h"
 #include "analysis/analysis.h"
 #include "analysis/distance.h"
 #include "analysis/Minkowski.h"
-
-#include "shared_ptr.h"
+#include "common/Domain.h"
+#include "common/Communication.h"
 #include "common/Utilities.h"
 #include "common/MPI_Helpers.h"
 #include "IO/MeshDatabase.h"
diff --git a/cmake/SharedPtr.cmake b/cmake/SharedPtr.cmake
deleted file mode 100644
index 9f610a98..00000000
--- a/cmake/SharedPtr.cmake
+++ /dev/null
@@ -1,170 +0,0 @@
-# Create a shared_ptr.h file in the include directory that contains 
-#    a shared_ptr class (hopefully typedef to a compiler basic)
-# Arguements:
-#    INSTALL_DIR - Directory to install shared_ptr.h
-#    NAMESPACE - Namespace to contain the shared_ptr class (may be empty)
-INCLUDE( CheckCXXSourceCompiles )
-FUNCTION( CONFIGURE_SHARED_PTR INSTALL_DIR NAMESPACE )
-    SET( CMAKE_REQUIRED_FLAGS ${CMAKE_CXX_FLAGS} )
-    CHECK_CXX_SOURCE_COMPILES(
-	    "   #include <memory>
-            namespace ${NAMESPACE} { using std::shared_ptr; }
-	        int main() {
-	            ${NAMESPACE}::shared_ptr<int> ptr;
-	            return 0;
-	        }
-	    "
-	    MEMORY_SHARED_PTR )
-    CHECK_CXX_SOURCE_COMPILES(
-	    "   #include <memory>
-            namespace ${NAMESPACE} { using std::tr1::shared_ptr; }
-	        int main() {
-	            ${NAMESPACE}::shared_ptr<int> ptr;
-	            return 0;
-	        }
-	    "
-	    MEMORY_TR1_SHARED_PTR )
-    CHECK_CXX_SOURCE_COMPILES(
-	    "   #include <tr1/memory>
-            namespace  ${NAMESPACE} { using std::tr1::shared_ptr; }
-	        int main() {
-	            ${NAMESPACE}::shared_ptr<int> ptr;
-	            return 0;
-	        }
-	    "
-	    TR1_MEMORY_TR1_SHARED_PTR )
-    GET_DIRECTORY_PROPERTY( dirs INCLUDE_DIRECTORIES )
-    SET( CMAKE_REQUIRED_FLAGS "${CMAKE_CXX_FLAGS}" )
-    SET( CMAKE_REQUIRED_INCLUDES ${dirs} "${BOOST_INCLUDE}" )
-    CHECK_CXX_SOURCE_COMPILES(
-	    "   #include \"boost/shared_ptr.hpp\"
-            namespace  ${NAMESPACE} { using boost::shared_ptr; }
-	        int main() {
-	            ${NAMESPACE}::shared_ptr<int> ptr;
-	            return 0;
-	        }
-	    "
-	    BOOST_SHARED_PTR )
-    WRITE_DUMMY_SHARED_PTR( "${NAMESPACE}" "${CMAKE_CURRENT_BINARY_DIR}/tmp/dummy_shared_ptr.h" )
-    CHECK_CXX_SOURCE_COMPILES(
-	    "   #include <iostream>
-	        #include \"${CMAKE_CURRENT_BINARY_DIR}/tmp/dummy_shared_ptr.h\"
-	        int main() {
-	            ${NAMESPACE}::shared_ptr<int> ptr;
-	            return 0;
-	        }
-	    "
-	    DUMMY_SHARED_PTR )
-    IF ( NOT NAMESPACE )
-        SET( NAMESPACE " " )
-    ENDIF()
-    IF ( BOOST_SHARED_PTR )
-        FILE(WRITE  "${CMAKE_CURRENT_BINARY_DIR}/tmp/shared_ptr.h" "#include \"boost/shared_ptr.hpp\"\n")
-        FILE(APPEND "${CMAKE_CURRENT_BINARY_DIR}/tmp/shared_ptr.h" "#include \"boost/weak_ptr.hpp\"\n")
-        FILE(APPEND "${CMAKE_CURRENT_BINARY_DIR}/tmp/shared_ptr.h" "#include \"boost/enable_shared_from_this.hpp\"\n")
-        FILE(APPEND "${CMAKE_CURRENT_BINARY_DIR}/tmp/shared_ptr.h" "namespace ${NAMESPACE} {\n")
-        FILE(APPEND "${CMAKE_CURRENT_BINARY_DIR}/tmp/shared_ptr.h" "    using boost::shared_ptr; \n")
-        FILE(APPEND "${CMAKE_CURRENT_BINARY_DIR}/tmp/shared_ptr.h" "    using boost::dynamic_pointer_cast; \n")
-        FILE(APPEND "${CMAKE_CURRENT_BINARY_DIR}/tmp/shared_ptr.h" "    using boost::const_pointer_cast; \n")
-        FILE(APPEND "${CMAKE_CURRENT_BINARY_DIR}/tmp/shared_ptr.h" "    using boost::weak_ptr; \n")
-        FILE(APPEND "${CMAKE_CURRENT_BINARY_DIR}/tmp/shared_ptr.h" "    using boost::enable_shared_from_this; \n")
-        FILE(APPEND "${CMAKE_CURRENT_BINARY_DIR}/tmp/shared_ptr.h" "}\n")
-    ELSEIF ( MEMORY_SHARED_PTR )
-        IF ( ${NAMESPACE} STREQUAL "std" )
-            FILE(WRITE  "${CMAKE_CURRENT_BINARY_DIR}/tmp/shared_ptr.h" "#include <memory>\n")
-        ELSE()
-            FILE(WRITE  "${CMAKE_CURRENT_BINARY_DIR}/tmp/shared_ptr.h" "#include <memory>\n")
-            FILE(APPEND "${CMAKE_CURRENT_BINARY_DIR}/tmp/shared_ptr.h" "namespace ${NAMESPACE} {\n")
-            FILE(APPEND "${CMAKE_CURRENT_BINARY_DIR}/tmp/shared_ptr.h" "    using std::shared_ptr; \n")
-            FILE(APPEND "${CMAKE_CURRENT_BINARY_DIR}/tmp/shared_ptr.h" "    using std::dynamic_pointer_cast; \n")
-            FILE(APPEND "${CMAKE_CURRENT_BINARY_DIR}/tmp/shared_ptr.h" "    using std::const_pointer_cast; \n")
-            FILE(APPEND "${CMAKE_CURRENT_BINARY_DIR}/tmp/shared_ptr.h" "    using std::weak_ptr; \n")
-            FILE(APPEND "${CMAKE_CURRENT_BINARY_DIR}/tmp/shared_ptr.h" "    using std::enable_shared_from_this; \n")
-            FILE(APPEND "${CMAKE_CURRENT_BINARY_DIR}/tmp/shared_ptr.h" "}\n")
-        ENDIF()
-    ELSEIF ( MEMORY_TR1_SHARED_PTR )
-        FILE(WRITE  "${CMAKE_CURRENT_BINARY_DIR}/tmp/shared_ptr.h" "#include <memory>\n")
-        FILE(APPEND "${CMAKE_CURRENT_BINARY_DIR}/tmp/shared_ptr.h" "namespace ${NAMESPACE} {\n")
-        FILE(APPEND "${CMAKE_CURRENT_BINARY_DIR}/tmp/shared_ptr.h" "    using std::tr1::shared_ptr; \n")
-        FILE(APPEND "${CMAKE_CURRENT_BINARY_DIR}/tmp/shared_ptr.h" "    using std::tr1::dynamic_pointer_cast; \n")
-        FILE(APPEND "${CMAKE_CURRENT_BINARY_DIR}/tmp/shared_ptr.h" "    using std::tr1::const_pointer_cast; \n")
-        FILE(APPEND "${CMAKE_CURRENT_BINARY_DIR}/tmp/shared_ptr.h" "    using std::tr1::weak_ptr; \n")
-        FILE(APPEND "${CMAKE_CURRENT_BINARY_DIR}/tmp/shared_ptr.h" "    using std::tr1::enable_shared_from_this; \n")
-        FILE(APPEND "${CMAKE_CURRENT_BINARY_DIR}/tmp/shared_ptr.h" "}\n")
-    ELSEIF ( TR1_MEMORY_TR1_SHARED_PTR )
-        FILE(WRITE  "${CMAKE_CURRENT_BINARY_DIR}/tmp/shared_ptr.h" "#include <tr1/memory>\n")
-        FILE(APPEND "${CMAKE_CURRENT_BINARY_DIR}/tmp/shared_ptr.h" "namespace ${NAMESPACE} {\n")
-        FILE(APPEND "${CMAKE_CURRENT_BINARY_DIR}/tmp/shared_ptr.h" "    using std::tr1::shared_ptr; \n")
-        FILE(APPEND "${CMAKE_CURRENT_BINARY_DIR}/tmp/shared_ptr.h" "    using std::tr1::dynamic_pointer_cast; \n")
-        FILE(APPEND "${CMAKE_CURRENT_BINARY_DIR}/tmp/shared_ptr.h" "    using std::tr1::const_pointer_cast; \n")
-        FILE(APPEND "${CMAKE_CURRENT_BINARY_DIR}/tmp/shared_ptr.h" "    using std::tr1::weak_ptr; \n")
-        FILE(APPEND "${CMAKE_CURRENT_BINARY_DIR}/tmp/shared_ptr.h" "    using std::tr1::enable_shared_from_this; \n")
-        FILE(APPEND "${CMAKE_CURRENT_BINARY_DIR}/tmp/shared_ptr.h" "}\n")
-    ELSEIF ( DUMMY_SHARED_PTR ) 
-        MESSAGE("Warning: No valid shared_ptr found, using dummy shared_ptr" )
-        WRITE_DUMMY_SHARED_PTR( "${NAMESPACE}" "${CMAKE_CURRENT_BINARY_DIR}/tmp/shared_ptr.h" )
-    ELSE()
-        MESSAGE(FATAL_ERROR "No shared_ptr availible")
-    ENDIF()
-    EXECUTE_PROCESS( COMMAND ${CMAKE_COMMAND} -E copy_if_different 
-        "${CMAKE_CURRENT_BINARY_DIR}/tmp/shared_ptr.h" "${INSTALL_DIR}/shared_ptr.h" )
-ENDFUNCTION()
-
-
-FUNCTION( WRITE_DUMMY_SHARED_PTR NAMESPACE FILENAME )
-    FILE(WRITE  "${FILENAME}" "#ifndef DUMMY_SHARED_PTR_INC\n")
-    FILE(APPEND "${FILENAME}" "#define DUMMY_SHARED_PTR_INC\n")
-    FILE(APPEND "${FILENAME}" "namespace dummy {\n\n")
-    FILE(APPEND "${FILENAME}" "template<class T> void DefaultDeleter(T* p) {delete p;}\n\n")
-    FILE(APPEND "${FILENAME}" "template<class T> class shared_ptr {\n")
-    FILE(APPEND "${FILENAME}" "public:\n")
-    FILE(APPEND "${FILENAME}" "    typedef void (*D)(T*);\n")
-    FILE(APPEND "${FILENAME}" "    shared_ptr( ): obj(NULL), deleter(DefaultDeleter<T>), count(NULL) {}\n")
-    FILE(APPEND "${FILENAME}" "    shared_ptr( T *ptr, void (*D)(T*)=DefaultDeleter<T>):\n")
-    FILE(APPEND "${FILENAME}" "        obj(ptr), deleter(D), count(NULL) { if (ptr) { count = new int; (*count)=1; } } \n")
-    FILE(APPEND "${FILENAME}" "    shared_ptr( const shared_ptr<T>& rhs ):  \n")
-    FILE(APPEND "${FILENAME}" "        obj(rhs.get()), deleter(reinterpret_cast<D>(rhs.deleter)), count(rhs.count) { if ( count!=NULL ) { ++(*count); } } \n")
-    FILE(APPEND "${FILENAME}" "    template<class U> shared_ptr( const shared_ptr<U>& rhs ):  \n")
-    FILE(APPEND "${FILENAME}" "        obj(rhs.get()), deleter(reinterpret_cast<D>(rhs.deleter)), count(rhs.count) { if ( count!=NULL ) { ++(*count); } } \n")
-    FILE(APPEND "${FILENAME}" "    shared_ptr& operator=( const shared_ptr<T>& rhs )\n")
-    FILE(APPEND "${FILENAME}" "        { if (this==&rhs) { return *this;} reset(); obj=rhs.obj; deleter=reinterpret_cast<D>(rhs.deleter); count=rhs.count; ++(*count); return *this; } \n")
-    FILE(APPEND "${FILENAME}" "    ~shared_ptr( ) { reset(); }\n")
-    FILE(APPEND "${FILENAME}" "    void reset( T *ptr ) { reset(); obj=ptr; count=new int; (*count)=1; }\n")
-    FILE(APPEND "${FILENAME}" "    void reset( void ) { \n")
-    FILE(APPEND "${FILENAME}" "        if ( count!=NULL) { int tmp=--(*count); if ( tmp==0 ) { deleter(obj); delete count; } } \n")
-    FILE(APPEND "${FILENAME}" "        obj=NULL; count=NULL; \n")
-    FILE(APPEND "${FILENAME}" "    }\n")
-    FILE(APPEND "${FILENAME}" "    T* get( ) const { return obj; } \n")
-    FILE(APPEND "${FILENAME}" "    T* operator->( ) const { return obj; } \n")
-    FILE(APPEND "${FILENAME}" "    const T& operator*( ) const { return *obj; } \n")
-    FILE(APPEND "${FILENAME}" "    bool operator==( const T * rhs ) const { return obj==rhs; } \n")
-    FILE(APPEND "${FILENAME}" "    bool operator!=( const T * rhs ) const { return obj!=rhs; } \n")
-    FILE(APPEND "${FILENAME}" "protected:\n")
-    FILE(APPEND "${FILENAME}" "    T *obj;\n")
-    FILE(APPEND "${FILENAME}" "    void (*deleter)(T*);\n")
-    FILE(APPEND "${FILENAME}" "    volatile int *count;\n")
-    FILE(APPEND "${FILENAME}" "template<class T1, class U> friend shared_ptr<T1> dynamic_pointer_cast( shared_ptr<U> const & );\n")
-    FILE(APPEND "${FILENAME}" "template<class T1, class U> friend shared_ptr<T1> const_pointer_cast( shared_ptr<U> const & );\n")
-    FILE(APPEND "${FILENAME}" "template<class Y> friend class shared_ptr;\n")
-    FILE(APPEND "${FILENAME}" "};\n\n")
-    FILE(APPEND "${FILENAME}" "template<class T, class U> shared_ptr<T> dynamic_pointer_cast( shared_ptr<U> const & rhs ) {\n")
-    FILE(APPEND "${FILENAME}" "    T* obj = dynamic_cast<T*>(rhs.obj);\n")
-    FILE(APPEND "${FILENAME}" "    shared_ptr<T> ptr;\n")
-    FILE(APPEND "${FILENAME}" "    if ( obj!=NULL ) { ptr.obj = obj; ptr.count=rhs.count; ++(*ptr.count); }\n")
-    FILE(APPEND "${FILENAME}" "    return ptr;\n}\n")
-    FILE(APPEND "${FILENAME}" "template<class T, class U> shared_ptr<T> const_pointer_cast( shared_ptr<U> const & rhs ) {\n")
-    FILE(APPEND "${FILENAME}" "    T* obj = const_cast<T*>(rhs.obj);\n")
-    FILE(APPEND "${FILENAME}" "    shared_ptr<T> ptr;\n")
-    FILE(APPEND "${FILENAME}" "    if ( obj!=NULL ) { ptr.obj = obj; ptr.count=rhs.count; ++(*ptr.count); }\n")
-    FILE(APPEND "${FILENAME}" "    return ptr;\n}\n")
-    FILE(APPEND "${FILENAME}" "\n} // namespace dummy\n")
-    FILE(APPEND "${FILENAME}" "\n\n")
-    FILE(APPEND "${FILENAME}" "namespace ${NAMESPACE} {\n")
-    FILE(APPEND "${FILENAME}" "    using dummy::shared_ptr; \n")
-    FILE(APPEND "${FILENAME}" "    using dummy::dynamic_pointer_cast; \n")
-    FILE(APPEND "${FILENAME}" "    using dummy::const_pointer_cast; \n")
-    FILE(APPEND "${FILENAME}" "}\n\n")
-    FILE(APPEND "${FILENAME}" "#endif\n")
-ENDFUNCTION()
-
-
diff --git a/common/Array.hpp b/common/Array.hpp
index 6b2dd16f..fe915ff7 100644
--- a/common/Array.hpp
+++ b/common/Array.hpp
@@ -1117,9 +1117,8 @@ Array<TYPE, FUN, Allocator> Array<TYPE, FUN, Allocator>::cat( const std::vector<
  *  Interpolate                                          *
  ********************************************************/
 template<class T>
-struct is_compatible_double : std::integral_constant<bool,
-                                  std::is_floating_point<typename std::remove_cv<T>::type>::value ||
-                                      std::is_integral<typename std::remove_cv<T>::type>::value> {
+struct is_compatible_double
+    : std::integral_constant<bool, std::is_floating_point<T>::value || std::is_integral<T>::value> {
 };
 template<class TYPE>
 inline typename std::enable_if<is_compatible_double<TYPE>::value, TYPE>::type Array_interp_1D(
diff --git a/common/MPI_Helpers.cpp b/common/MPI_Helpers.cpp
index 23924f21..736a2f02 100644
--- a/common/MPI_Helpers.cpp
+++ b/common/MPI_Helpers.cpp
@@ -36,7 +36,7 @@ template<> MPI_Datatype getMPItype<double>() {
 ********************************************************/
 // unsigned char
 template<>
-size_t packsize<unsigned char>( const unsigned char& rhs )
+size_t packsize<unsigned char>( const unsigned char& )
 {
     return sizeof(unsigned char);
 }
@@ -52,7 +52,7 @@ void unpack<unsigned char>( unsigned char& data, const char *buffer )
 }
 // char
 template<>
-size_t packsize<char>( const char& rhs )
+size_t packsize<char>( const char& )
 {
     return sizeof(char);
 }
@@ -68,7 +68,7 @@ void unpack<char>( char& data, const char *buffer )
 }
 // int
 template<>
-size_t packsize<int>( const int& rhs )
+size_t packsize<int>( const int& )
 {
     return sizeof(int);
 }
@@ -84,7 +84,7 @@ void unpack<int>( int& data, const char *buffer )
 }
 // unsigned int
 template<>
-size_t packsize<unsigned int>( const unsigned int& rhs )
+size_t packsize<unsigned int>( const unsigned int& )
 {
     return sizeof(unsigned int);
 }
@@ -100,7 +100,7 @@ void unpack<unsigned int>( unsigned int& data, const char *buffer )
 }
 // size_t
 template<>
-size_t packsize<size_t>( const size_t& rhs )
+size_t packsize<size_t>( const size_t& )
 {
     return sizeof(size_t);
 }
diff --git a/common/StackTrace.cpp b/common/StackTrace.cpp
deleted file mode 100644
index 8b9e4015..00000000
--- a/common/StackTrace.cpp
+++ /dev/null
@@ -1,1876 +0,0 @@
-#include "common/StackTrace.h"
-
-#include <algorithm>
-#include <csignal>
-#include <cstring>
-#include <iostream>
-#include <map>
-#include <memory>
-#include <mutex>
-#include <random>
-#include <set>
-#include <sstream>
-#include <stdexcept>
-#include <thread>
-
-
-#define perr std::cerr
-
-
-// Detect the OS
-// clang-format off
-#if defined( WIN32 ) || defined( _WIN32 ) || defined( WIN64 ) || defined( _WIN64 ) || defined( _MSC_VER )
-    #define USE_WINDOWS
-    #define NOMINMAX
-#elif defined( __APPLE__ )
-    #define USE_MAC
-    #define USE_NM
-#elif defined( __linux ) || defined( __linux__ ) || defined( __unix ) || defined( __posix )
-    #define USE_LINUX
-    #define USE_NM
-#else
-    #error Unknown OS
-#endif
-// clang-format on
-
-
-// Include system dependent headers
-// clang-format off
-// Detect the OS and include system dependent headers
-#ifdef USE_WINDOWS
-    #include <windows.h>
-    #include <dbghelp.h>
-    #include <DbgHelp.h>
-    #include <TlHelp32.h>
-    #include <Psapi.h>
-    #include <process.h>
-    #include <stdio.h>
-    #include <tchar.h>
-    #pragma comment( lib, "version.lib" ) // for "VerQueryValue"
-#else
-    #include <dlfcn.h>
-    #include <execinfo.h>
-    #include <sched.h>
-    #include <sys/time.h>
-    #include <ctime>
-    #include <unistd.h>
-    #include <sys/syscall.h>
-#endif
-#ifdef USE_MAC
-    #include <mach-o/dyld.h>
-    #include <mach/mach.h>
-    #include <sys/sysctl.h>
-    #include <sys/types.h>
-#endif
-// clang-format on
-
-
-#ifdef __GNUC__
-#define USE_ABI
-#include <cxxabi.h>
-#endif
-
-
-#ifndef NULL_USE
-#define NULL_USE( variable )                 \
-    do {                                     \
-        if ( 0 ) {                           \
-            char *temp = (char *) &variable; \
-            temp++;                          \
-        }                                    \
-    } while ( 0 )
-#endif
-
-
-// Set the callstack signal
-#ifdef SIGRTMIN
-#define CALLSTACK_SIG SIGRTMIN + 4
-#else
-#define CALLSTACK_SIG SIGUSR1
-#define SIGRTMIN SIGUSR1
-#define SIGRTMAX SIGUSR1
-#endif
-
-
-// Helper thread
-static std::shared_ptr<std::thread> globalMonitorThread;
-
-
-// Utility to break a string by a newline
-static inline std::vector<std::string> breakString( const std::string &str )
-{
-    std::vector<std::string> strvec;
-    size_t i1 = 0;
-    size_t i2 = std::min( str.find( '\n', i1 ), str.length() );
-    while ( i1 < str.length() ) {
-        strvec.push_back( str.substr( i1, i2 - i1 ) );
-        i1 = i2 + 1;
-        i2 = std::min( str.find( '\n', i1 ), str.length() );
-    }
-    return strvec;
-}
-
-
-// Function to replace all instances of a string with another
-static inline void strrep( std::string &str, const std::string &s, const std::string &r )
-{
-    size_t i = 0;
-    while ( i < str.length() ) {
-        i = str.find( s, i );
-        if ( i == std::string::npos ) {
-            break;
-        }
-        str.replace( i, s.length(), r );
-        i += r.length();
-    }
-}
-
-
-// Utility to strip the path from a filename
-static inline std::string stripPath( const std::string &filename )
-{
-    if ( filename.empty() )
-        return std::string();
-    int i = 0;
-    for ( i = (int) filename.size() - 1; i >= 0 && filename[i] != 47 && filename[i] != 92; i-- ) {
-    }
-    i = std::max( 0, i + 1 );
-    return filename.substr( i );
-}
-
-
-// Inline function to subtract two addresses returning the absolute difference
-static inline void *subtractAddress( void *a, void *b )
-{
-    return reinterpret_cast<void *>(
-        std::abs( reinterpret_cast<long long int>( a ) - reinterpret_cast<long long int>( b ) ) );
-}
-
-
-#ifdef USE_WINDOWS
-static BOOL __stdcall readProcMem( HANDLE hProcess,
-                                   DWORD64 qwBaseAddress,
-                                   PVOID lpBuffer,
-                                   DWORD nSize,
-                                   LPDWORD lpNumberOfBytesRead )
-{
-    SIZE_T st;
-    BOOL bRet = ReadProcessMemory( hProcess, (LPVOID) qwBaseAddress, lpBuffer, nSize, &st );
-    *lpNumberOfBytesRead = (DWORD) st;
-    return bRet;
-}
-static inline std::string getCurrentDirectory()
-{
-    char temp[1024] = { 0 };
-    GetCurrentDirectoryA( sizeof( temp ), temp );
-    return temp;
-}
-namespace StackTrace {
-BOOL GetModuleListTH32( HANDLE hProcess, DWORD pid );
-BOOL GetModuleListPSAPI( HANDLE hProcess );
-DWORD LoadModule( HANDLE hProcess, LPCSTR img, LPCSTR mod, DWORD64 baseAddr, DWORD size );
-void LoadModules();
-}; // namespace StackTrace
-#endif
-
-
-// Functions to copy data
-static inline char *copy_in( size_t N, const void *data, char *ptr )
-{
-    memcpy( ptr, data, N );
-    return ptr + N;
-}
-static inline const char *copy_out( size_t N, void *data, const char *ptr )
-{
-    memcpy( data, ptr, N );
-    return ptr + N;
-}
-
-
-/****************************************************************************
- *  Utility to call system command and return output                         *
- ****************************************************************************/
-#ifdef USE_WINDOWS
-#define popen _popen
-#define pclose _pclose
-#endif
-std::string StackTrace::exec( const std::string &cmd, int &code )
-{
-    signal( SIGCHLD, SIG_DFL ); // Clear child exited
-    FILE *pipe = popen( cmd.c_str(), "r" );
-    if ( pipe == nullptr )
-        return std::string();
-    std::string result = "";
-    result.reserve( 1024 );
-    while ( !feof( pipe ) ) {
-        char buffer[257];
-        buffer[256] = 0;
-        if ( fgets( buffer, 128, pipe ) != nullptr )
-            result += buffer;
-    }
-    auto status = pclose( pipe );
-    code        = WEXITSTATUS( status );
-    return result;
-}
-
-
-/****************************************************************************
- *  stack_info                                                               *
- ****************************************************************************/
-void StackTrace::stack_info::clear()
-{
-    address  = nullptr;
-    address2 = nullptr;
-    object.clear();
-    function.clear();
-    filename.clear();
-    line = -1;
-}
-bool StackTrace::stack_info::operator==( const StackTrace::stack_info &rhs ) const
-{
-    if ( address == rhs.address )
-        return true;
-    if ( address2 == rhs.address2 && object == rhs.object )
-        return true;
-    return false;
-}
-bool StackTrace::stack_info::operator!=( const StackTrace::stack_info &rhs ) const
-{
-    return !operator==( rhs );
-}
-int StackTrace::stack_info::getAddressWidth() const
-{
-    auto addr = reinterpret_cast<unsigned long long int>( address );
-    if ( addr <= 0xFFFF )
-        return 4;
-    if ( addr <= 0xFFFFFFFF )
-        return 8;
-    if ( addr <= 0xFFFFFFFFFFFF )
-        return 12;
-    return 16;
-}
-std::string
-StackTrace::stack_info::print( int widthAddress, int widthObject, int widthFunction ) const
-{
-    char tmp1[64], tmp2[64];
-    sprintf( tmp1, "0x%%0%illx:  ", widthAddress );
-    sprintf( tmp2, tmp1, reinterpret_cast<unsigned long long int>( address ) );
-    std::string stack( tmp2 );
-    sprintf( tmp2, "%i", line );
-    std::string line_str( tmp2 );
-    size_t N = stack.length();
-    stack += stripPath( object );
-    stack.resize( std::max<size_t>( stack.size(), N + widthObject ), ' ' );
-    N = stack.length() + 2;
-    stack += "  " + function;
-    if ( !filename.empty() && line > 0 ) {
-        stack.resize( std::max<size_t>( stack.size(), N + widthFunction ), ' ' );
-        stack += "  " + stripPath( filename ) + ":" + line_str;
-    } else if ( !filename.empty() ) {
-        stack.resize( std::max<size_t>( stack.size(), N + widthFunction ), ' ' );
-        stack += "  " + stripPath( filename );
-    } else if ( line > 0 ) {
-        stack += " : " + line_str;
-    }
-    return stack;
-}
-size_t StackTrace::stack_info::size() const
-{
-    return 2 * sizeof( void * ) + 4 * sizeof( int ) + object.size() + function.size() +
-           filename.size();
-}
-char *StackTrace::stack_info::pack( char *ptr ) const
-{
-    int Nobj  = object.size();
-    int Nfun  = function.size();
-    int Nfile = filename.size();
-    ptr       = copy_in( sizeof( void * ), &address, ptr );
-    ptr       = copy_in( sizeof( void * ), &address2, ptr );
-    ptr       = copy_in( sizeof( int ), &Nobj, ptr );
-    ptr       = copy_in( sizeof( int ), &Nfun, ptr );
-    ptr       = copy_in( sizeof( int ), &Nfile, ptr );
-    ptr       = copy_in( sizeof( int ), &line, ptr );
-    ptr       = copy_in( Nobj, object.data(), ptr );
-    ptr       = copy_in( Nfun, function.data(), ptr );
-    ptr       = copy_in( Nfile, filename.data(), ptr );
-    return ptr;
-}
-const char *StackTrace::stack_info::unpack( const char *ptr )
-{
-    int Nobj, Nfun, Nfile;
-    ptr = copy_out( sizeof( void * ), &address, ptr );
-    ptr = copy_out( sizeof( void * ), &address2, ptr );
-    ptr = copy_out( sizeof( int ), &Nobj, ptr );
-    ptr = copy_out( sizeof( int ), &Nfun, ptr );
-    ptr = copy_out( sizeof( int ), &Nfile, ptr );
-    ptr = copy_out( sizeof( int ), &line, ptr );
-    object.resize( Nobj );
-    function.resize( Nfun );
-    filename.resize( Nfile );
-    ptr = copy_out( Nobj, &object.front(), ptr );
-    ptr = copy_out( Nfun, &function.front(), ptr );
-    ptr = copy_out( Nfile, &filename.front(), ptr );
-    return ptr;
-}
-std::vector<char> StackTrace::stack_info::packArray( const std::vector<stack_info> &data )
-{
-    size_t size = sizeof( int );
-    for ( const auto &i : data )
-        size += i.size();
-    std::vector<char> vec( size, 0 );
-    char *ptr = vec.data();
-    int N     = data.size();
-    ptr       = copy_in( sizeof( int ), &N, ptr );
-    for ( const auto &i : data )
-        ptr = i.pack( ptr );
-    return vec;
-}
-std::vector<StackTrace::stack_info> StackTrace::stack_info::unpackArray( const char *ptr )
-{
-    int N;
-    ptr = copy_out( sizeof( int ), &N, ptr );
-    std::vector<stack_info> data( N );
-    for ( auto &i : data )
-        ptr = i.unpack( ptr );
-    return data;
-}
-#ifdef USE_MPI
-static std::vector<char> pack( const std::vector<std::vector<StackTrace::stack_info>> &data )
-{
-    size_t size = sizeof( int );
-    for ( const auto &i : data ) {
-        size += sizeof( int );
-        for ( size_t j = 0; j < i.size(); j++ )
-            size += i[j].size();
-    }
-    std::vector<char> out( size, 0 );
-    char *ptr = out.data();
-    int N     = data.size();
-    ptr       = copy_in( sizeof( int ), &N, ptr );
-    for ( int i = 0; i < N; i++ ) {
-        int M = data[i].size();
-        ptr   = copy_in( sizeof( int ), &M, ptr );
-        for ( int j = 0; j < M; j++ )
-            ptr = data[i][j].pack( ptr );
-    }
-    return out;
-}
-static std::vector<std::vector<StackTrace::stack_info>> unpack( const std::vector<char> &in )
-{
-    const char *ptr = in.data();
-    int N;
-    ptr = copy_out( sizeof( int ), &N, ptr );
-    std::vector<std::vector<StackTrace::stack_info>> data( N );
-    for ( int i = 0; i < N; i++ ) {
-        int M;
-        ptr = copy_out( sizeof( int ), &M, ptr );
-        data[i].resize( M );
-        for ( int j = 0; j < M; j++ )
-            ptr = data[i][j].unpack( ptr );
-    }
-    return data;
-}
-#endif
-
-
-/****************************************************************************
- *  multi_stack_info                                                         *
- ****************************************************************************/
-StackTrace::multi_stack_info::multi_stack_info( const std::vector<stack_info> &rhs )
-{
-    operator=( rhs );
-}
-StackTrace::multi_stack_info &StackTrace::multi_stack_info::
-operator=( const std::vector<stack_info> &rhs )
-{
-    clear();
-    if ( rhs.empty() )
-        return *this;
-    N     = 1;
-    stack = rhs[0];
-    if ( rhs.size() > 1 )
-        add( rhs.size() - 1, &rhs[1] );
-    return *this;
-}
-void StackTrace::multi_stack_info::clear()
-{
-    N = 0;
-    stack.clear();
-    children.clear();
-}
-void StackTrace::multi_stack_info::print2( const std::string &prefix,
-                                           int w[3],
-                                           std::vector<std::string> &text ) const
-{
-    if ( stack == stack_info() ) {
-        for ( const auto &child : children )
-            child.print2( "", w, text );
-        return;
-    }
-    std::string line = prefix + "[" + std::to_string( N ) + "] " + stack.print( w[0], w[1], w[2] );
-    text.push_back( line );
-    std::string prefix2 = prefix + "  ";
-    for ( size_t i = 0; i < children.size(); i++ ) {
-        const auto &child = children[i];
-        std::vector<std::string> text2;
-        child.print2( "", w, text2 );
-        for ( size_t j = 0; j < text2.size(); j++ ) {
-            std::string line = prefix2 + text2[j];
-            if ( children.size() > 1 && j > 0 && i < children.size() - 1 )
-                line[prefix2.size()] = '|';
-            text.push_back( line );
-        }
-    }
-}
-std::vector<std::string> StackTrace::multi_stack_info::print( const std::string &prefix ) const
-{
-    std::vector<std::string> text;
-    int w[3] = { 0 };
-    w[0]     = getAddressWidth();
-    w[1]     = getObjectWidth();
-    w[2]     = getFunctionWidth();
-    print2( prefix, w, text );
-    return text;
-}
-int StackTrace::multi_stack_info::getAddressWidth() const
-{
-    int w = stack.getAddressWidth();
-    for ( const auto &child : children )
-        w = std::max( w, child.getAddressWidth() );
-    return w;
-}
-int StackTrace::multi_stack_info::getObjectWidth() const
-{
-    int w = std::min<int>( stripPath( stack.object ).size() + 1, 20 );
-    for ( const auto &child : children )
-        w = std::max( w, child.getObjectWidth() );
-    return w;
-}
-int StackTrace::multi_stack_info::getFunctionWidth() const
-{
-    int w = std::min<int>( stack.function.size() + 1, 40 );
-    for ( const auto &child : children )
-        w = std::max( w, child.getFunctionWidth() );
-    return w;
-}
-void StackTrace::multi_stack_info::add( size_t len, const stack_info *stack )
-{
-    if ( len == 0 )
-        return;
-    const auto &s = stack[len - 1];
-    for ( auto &i : children ) {
-        if ( i.stack == s ) {
-            i.N++;
-            if ( len > 1 )
-                i.add( len - 1, stack );
-            return;
-        }
-    }
-    children.resize( children.size() + 1 );
-    children.back().N     = 1;
-    children.back().stack = s;
-    if ( len > 1 )
-        children.back().add( len - 1, stack );
-}
-
-
-/****************************************************************************
- *  Function to find an entry                                                *
- ****************************************************************************/
-template <class TYPE>
-inline size_t findfirst( const std::vector<TYPE> &X, TYPE Y )
-{
-    if ( X.empty() )
-        return 0;
-    size_t lower = 0;
-    size_t upper = X.size() - 1;
-    if ( X[lower] >= Y )
-        return lower;
-    if ( X[upper] < Y )
-        return upper;
-    while ( ( upper - lower ) != 1 ) {
-        size_t value = ( upper + lower ) / 2;
-        if ( X[value] >= Y )
-            upper = value;
-        else
-            lower = value;
-    }
-    return upper;
-}
-
-
-/****************************************************************************
- *  Function to get the executable name                                      *
- ****************************************************************************/
-static char global_exe_name[1000] = { 0 };
-static bool setGlobalExecutableName( char *exe )
-{
-    try {
-#ifdef USE_LINUX
-        auto *buf = new char[0x10000];
-        int len   = ::readlink( "/proc/self/exe", buf, 0x10000 );
-        if ( len != -1 ) {
-            buf[len] = '\0';
-            strcpy( exe, buf );
-        }
-        delete[] buf;
-#elif defined( USE_MAC )
-        uint32_t size = 0x10000;
-        char *buf     = new char[size];
-        memset( buf, 0, size );
-        if ( _NSGetExecutablePath( buf, &size ) == 0 )
-            strcpy( exe, buf );
-        delete[] buf;
-#elif defined( USE_WINDOWS )
-        DWORD size = 0x10000;
-        char *buf  = new char[size];
-        memset( buf, 0, size );
-        GetModuleFileName( nullptr, buf, size );
-        strcpy( exe, buf );
-        delete[] buf;
-#endif
-    } catch ( ... ) {
-    }
-    return true;
-}
-static bool global_exe_name_set = setGlobalExecutableName( global_exe_name );
-std::string StackTrace::getExecutable()
-{
-    if ( !global_exe_name_set )
-        global_exe_name_set = setGlobalExecutableName( global_exe_name );
-    return std::string( global_exe_name );
-}
-
-
-/****************************************************************************
- * Function to get symbols for the executable from nm (if availible)         *
- * Note: this function maintains an internal cached copy to prevent          *
- *    exccessive calls to nm.  This function also uses a lock to ensure      *
- *    thread safety.                                                         *
- ****************************************************************************/
-std::mutex getSymbols_mutex;
-struct global_symbols_struct {
-    std::vector<void *> address;
-    std::vector<char> type;
-    std::vector<std::string> obj;
-    int error;
-} global_symbols;
-static const global_symbols_struct &getSymbols2()
-{
-    static bool loaded = false;
-    static global_symbols_struct data;
-    // Load the symbol tables if they have not been loaded
-    if ( !loaded ) {
-        getSymbols_mutex.lock();
-        if ( !loaded ) {
-            loaded = true;
-#ifdef USE_NM
-            try {
-                char cmd[1024];
-#ifdef USE_LINUX
-                sprintf( cmd, "nm -n --demangle %s", global_exe_name );
-#elif defined( USE_MAC )
-                sprintf( cmd, "nm -n %s | c++filt", global_exe_name );
-#else
-#error Unknown OS using nm
-#endif
-                int code;
-                auto output = breakString( StackTrace::exec( cmd, code ) );
-                for ( const auto &line : output ) {
-                    if ( line.empty() )
-                        continue;
-                    if ( line[0] == ' ' )
-                        continue;
-                    auto *a = const_cast<char *>( line.c_str() );
-                    char *b = strchr( a, ' ' );
-                    if ( b == nullptr )
-                        continue;
-                    b[0] = 0;
-                    b++;
-                    char *c = strchr( b, ' ' );
-                    if ( c == nullptr )
-                        continue;
-                    c[0] = 0;
-                    c++;
-                    char *d = strchr( c, '\n' );
-                    if ( d )
-                        d[0] = 0;
-                    size_t add = strtoul( a, nullptr, 16 );
-                    data.address.push_back( reinterpret_cast<void *>( add ) );
-                    data.type.push_back( b[0] );
-                    data.obj.emplace_back( c );
-                }
-            } catch ( ... ) {
-                data.error = -3;
-            }
-            data.error = 0;
-#else
-            data.error = -1;
-#endif
-        }
-        getSymbols_mutex.unlock();
-    }
-    return data;
-}
-int StackTrace::getSymbols( std::vector<void *> &address,
-                            std::vector<char> &type,
-                            std::vector<std::string> &obj )
-{
-    const global_symbols_struct &data = getSymbols2();
-    address                           = data.address;
-    type                              = data.type;
-    obj                               = data.obj;
-    return data.error;
-}
-
-
-/****************************************************************************
- *  Function to get call stack info                                          *
- ****************************************************************************/
-#ifdef USE_MAC
-static void *loadAddress( const std::string &object )
-{
-    static std::map<std::string, void *> obj_map;
-    if ( obj_map.empty() ) {
-        uint32_t numImages = _dyld_image_count();
-        for ( uint32_t i = 0; i < numImages; i++ ) {
-            const struct mach_header *header = _dyld_get_image_header( i );
-            const char *name                 = _dyld_get_image_name( i );
-            const char *p                    = strrchr( name, '/' );
-            struct mach_header *address      = const_cast<struct mach_header *>( header );
-            obj_map.insert( std::pair<std::string, void *>( p + 1, address ) );
-            // printf("   module=%s, address=%p\n", p + 1, header);
-        }
-    }
-    auto it       = obj_map.find( object );
-    void *address = 0;
-    if ( it != obj_map.end() ) {
-        address = it->second;
-    } else {
-        it = obj_map.find( stripPath( object ) );
-        if ( it != obj_map.end() )
-            address = it->second;
-    }
-    // printf("%s: 0x%016llx\n",object.c_str(),address);
-    return address;
-}
-static std::tuple<std::string, std::string, std::string, int> split_atos( const std::string &buf )
-{
-    if ( buf.empty() )
-        return std::tuple<std::string, std::string, std::string, int>();
-    // Get the function
-    size_t index = buf.find( " (in " );
-    if ( index == std::string::npos )
-        return std::make_tuple(
-            buf.substr( 0, buf.length() - 1 ), std::string(), std::string(), 0 );
-    std::string fun = buf.substr( 0, index );
-    std::string tmp = buf.substr( index + 5 );
-    // Get the object
-    index           = tmp.find( ')' );
-    std::string obj = tmp.substr( 0, index );
-    tmp             = tmp.substr( index + 1 );
-    // Get the filename and line number
-    size_t p1 = tmp.find( '(' );
-    size_t p2 = tmp.find( ')' );
-    tmp       = tmp.substr( p1 + 1, p2 - p1 - 1 );
-    index     = tmp.find( ':' );
-    std::string file;
-    int line = 0;
-    if ( index != std::string::npos ) {
-        file = tmp.substr( 0, index );
-        line = std::stoi( tmp.substr( index + 1 ) );
-    } else if ( p1 != std::string::npos ) {
-        file = tmp;
-    }
-    return std::make_tuple( fun, obj, file, line );
-}
-#endif
-#ifdef USE_LINUX
-using uint_p = uint64_t;
-#elif defined( USE_MAC )
-typedef unsigned long uint_p;
-#endif
-#if defined( USE_LINUX ) || defined( USE_MAC )
-static inline std::string generateCmd( const std::string &s1,
-                                       const std::string &s2,
-                                       const std::string &s3,
-                                       std::vector<void *> addresses,
-                                       const std::string &s4 )
-{
-    std::string cmd = s1 + s2 + s3;
-    for ( auto &addresse : addresses ) {
-        char tmp[32];
-        sprintf( tmp, "%lx ", reinterpret_cast<uint_p>( addresse ) );
-        cmd += tmp;
-    }
-    cmd += s4;
-    return cmd;
-}
-#endif
-// clang-format off
-static void getFileAndLineObject( std::vector<StackTrace::stack_info*> &info )
-{
-    if ( info.empty() )
-        return;
-    // This gets the file and line numbers for multiple stack lines in the same object
-    #if defined( USE_LINUX )
-        // Create the call command
-        std::vector<void*> address_list(info.size(),nullptr);
-        for (size_t i=0; i<info.size(); i++) {
-            address_list[i] = info[i]->address;
-            if ( info[i]->object.find( ".so" ) != std::string::npos )
-                address_list[i] = info[i]->address2; 
-            if ( info[i]->object.find( ".mexa64" ) != std::string::npos )
-                address_list[i] = info[i]->address2; 
-        }
-        std::string cmd = generateCmd( "addr2line -C -e ", info[0]->object,
-            " -f -i ", address_list, " 2> /dev/null" );
-        // Get the function/line/file
-        int code;
-        auto cmd_output = StackTrace::exec( cmd, code );
-        auto output = breakString( cmd_output );
-        if ( output.size() != 2*info.size() )
-            return;
-        // Add the results to info
-        for (size_t i=0; i<info.size(); i++) {
-            // get function name
-            if ( info[i]->function.empty() )
-                info[i]->function = output[2*i+0];
-            // get file and line
-            const char *buf = output[2*i+1].c_str();
-            if ( buf[0] != '?' && buf[0] != 0 ) {
-                size_t j = 0;
-                for ( j = 0; j < 4095 && buf[j] != ':'; j++ ) {
-                }
-                info[i]->filename = std::string( buf, j );
-                info[i]->line     = atoi( &buf[j + 1] );
-            }
-        }
-    #elif defined( USE_MAC )
-        // Create the call command
-        void* load_address = loadAddress( info[0]->object );
-        if ( load_address == nullptr )
-            return;
-        std::vector<void*> address_list(info.size(),nullptr);
-        for (size_t i=0; i<info.size(); i++)
-            address_list[i] = info[i]->address;
-        // Call atos to get the object info
-        char tmp[64];
-        sprintf( tmp, " -l %lx ", (uint_p) load_address );
-        std::string cmd = generateCmd( "atos -o ", info[0]->object,
-            tmp, address_list, " 2> /dev/null" );
-        // Get the function/line/file
-        int code;
-        auto cmd_output = StackTrace::exec( cmd, code );
-        auto output = breakString( cmd_output );
-        if ( output.size() != info.size() )
-            return;
-        // Parse the output for function, file and line info
-        for ( size_t i=0; i<info.size(); i++) {
-            auto data = split_atos( output[i] );
-            if ( info[i]->function.empty() )
-                info[i]->function = std::get<0>(data);
-            if ( info[i]->object.empty() )
-                info[i]->object = std::get<1>(data);
-            if ( info[i]->filename.empty() )
-                info[i]->filename = std::get<2>(data);
-            if ( info[i]->line==0 )
-                info[i]->line = std::get<3>(data);
-        }
-    #endif
-}
-static void getFileAndLine( std::vector<StackTrace::stack_info> &info )
-{
-    // Build a list of stack elements for each object
-    std::map<std::string,std::vector<StackTrace::stack_info*>> obj_map;
-    for (auto & i : info) {
-        auto& list = obj_map[i.object];
-        list.emplace_back( &i );
-    }
-    // For each object, get the file/line numbers for all entries
-    for ( auto& entry : obj_map ) 
-        getFileAndLineObject( entry.second );
-}
-// Try to use the global symbols to decode info about the stack
-static void getDataFromGlobalSymbols( StackTrace::stack_info &info )
-{
-    const global_symbols_struct &data = getSymbols2();
-    if ( data.error == 0 ) {
-        size_t index = findfirst( global_symbols.address, info.address );
-        if ( index > 0 )
-            info.object = global_symbols.obj[index - 1];
-        else
-            info.object = std::string(global_exe_name);
-    }
-}
-static void signal_handler( int sig )
-{
-    printf("Signal caught acquiring stack (%i)\n",sig);
-    StackTrace::setErrorHandlers( [](std::string,StackTrace::terminateType) { exit( -1 ); } );
-}
-StackTrace::stack_info StackTrace::getStackInfo( void *address )
-{
-    return getStackInfo( std::vector<void*>(1,address) )[0];
-}
-std::vector<StackTrace::stack_info> StackTrace::getStackInfo( const std::vector<void*>& address )
-{
-    // Temporarily handle signals to prevent recursion on the stack
-    auto prev_handler = signal( SIGINT, signal_handler );
-    // Get the detailed stack info
-    std::vector<StackTrace::stack_info> info(address.size());
-    try {
-        #ifdef USE_WINDOWS
-            IMAGEHLP_SYMBOL64 pSym[1024];
-            memset( pSym, 0, sizeof( pSym ) );
-            pSym->SizeOfStruct  = sizeof( IMAGEHLP_SYMBOL64 );
-            pSym->MaxNameLength = 1024;
-
-            IMAGEHLP_MODULE64 Module;
-            memset( &Module, 0, sizeof( Module ) );
-            Module.SizeOfStruct = sizeof( Module );
-
-            HANDLE pid = GetCurrentProcess();
-
-            for (size_t i=0; i<address.size(); i++) {
-                info[i].address = address[i];
-                DWORD64 address2 = reinterpret_cast<DWORD64>( address[i] );
-                DWORD64 offsetFromSymbol;
-                if ( SymGetSymFromAddr( pid, address2, &offsetFromSymbol, pSym ) != FALSE ) {
-                    char name[8192]={0};
-                    DWORD rtn = UnDecorateSymbolName( pSym->Name, name, sizeof(name)-1, UNDNAME_COMPLETE );
-                    if ( rtn == 0 )
-                        info[i].function = std::string(pSym->Name);
-                    else
-                        info[i].function = std::string(name);
-                } else {
-                    printf( "ERROR: SymGetSymFromAddr (%d,%p)\n", GetLastError(), address2 );
-                }
-
-                // Get line number
-                IMAGEHLP_LINE64 Line;
-                memset( &Line, 0, sizeof( Line ) );
-                Line.SizeOfStruct = sizeof( Line );
-                DWORD offsetFromLine;
-                if ( SymGetLineFromAddr64( pid, address2, &offsetFromLine, &Line ) != FALSE ) {
-                    info[i].line     = Line.LineNumber;
-                    info[i].filename = std::string( Line.FileName );
-                } else {
-                    info[i].line     = 0;
-                    info[i].filename = std::string();
-                }
-
-                // Get the object
-                if ( SymGetModuleInfo64( pid, address2, &Module ) != FALSE ) {
-                    //info[i].object = std::string( Module.ModuleName );
-                    info[i].object = std::string( Module.LoadedImageName );
-                    //info[i].baseOfImage = Module.BaseOfImage;
-                }
-            }
-        #else
-            for (size_t i=0; i<address.size(); i++) {
-                info[i].address = address[i];
-                #if defined(_GNU_SOURCE) || defined(USE_MAC)
-                    Dl_info dlinfo;
-                    if ( !dladdr( info[i].address, &dlinfo ) ) {
-                        getDataFromGlobalSymbols( info[i] );
-                        continue;
-                    }
-                    info[i].address2 = subtractAddress( info[i].address, dlinfo.dli_fbase );
-                    info[i].object   = std::string( dlinfo.dli_fname );
-                    #if defined( USE_ABI )
-                        int status;
-                        char *demangled = abi::__cxa_demangle( dlinfo.dli_sname, nullptr, nullptr, &status );
-                        if ( status == 0 && demangled != nullptr ) {
-                            info[i].function = std::string( demangled );
-                        } else if ( dlinfo.dli_sname != nullptr ) {
-                            info[i].function = std::string( dlinfo.dli_sname );
-                        }
-                        free( demangled );
-                    #endif
-                    if ( dlinfo.dli_sname != nullptr && info[i].function.empty() )
-                        info[i].function = std::string( dlinfo.dli_sname );
-                #else
-                    getDataFromGlobalSymbols( info[i] );
-                #endif
-            }
-            // Get the filename / line numbers for each item on the stack
-            getFileAndLine( info );
-        #endif
-    } catch ( ... ) {
-    }
-    signal( SIGINT, prev_handler ) ;
-    return info;
-}
-
-
-/****************************************************************************
-*  Function to get the backtrace                                            *
-****************************************************************************/
-static int backtrace_thread( const std::thread::native_handle_type&, void**, size_t );
-#if defined( USE_LINUX ) || defined( USE_MAC )
-static int thread_backtrace_count;
-static void* thread_backtrace[1000];
-static std::mutex thread_backtrace_mutex;
-static void _callstack_signal_handler( int, siginfo_t*, void* )
-{
-    thread_backtrace_count = backtrace_thread( StackTrace::thisThread(), thread_backtrace, 1000 );
-}
-#endif
-static int backtrace_thread( const std::thread::native_handle_type& tid, void **buffer, size_t size )
-{
-    int count = 0;
-    #if defined( USE_LINUX ) || defined( USE_MAC )
-        // Get the trace
-        if ( tid == pthread_self() ) {
-            count = ::backtrace( buffer, size );
-        } else {
-            // Note: this will get the backtrace, but terminates the thread in the process!!!
-            thread_backtrace_mutex.lock();
-            struct sigaction sa;
-            sigfillset(&sa.sa_mask);
-            sa.sa_flags = SA_SIGINFO;
-            sa.sa_sigaction = _callstack_signal_handler;
-            sigaction(CALLSTACK_SIG, &sa, nullptr);
-            thread_backtrace_count = -1;
-            pthread_kill( tid, CALLSTACK_SIG );
-            auto t1 = std::chrono::high_resolution_clock::now();
-            auto t2 = std::chrono::high_resolution_clock::now();
-            while ( thread_backtrace_count==-1 && std::chrono::duration<double>(t2-t1).count()<0.15 ) {
-                std::this_thread::yield();
-                t2 = std::chrono::high_resolution_clock::now();
-            }
-            count = std::max(thread_backtrace_count,0);
-            memcpy( buffer, thread_backtrace, count*sizeof(void*) );
-            thread_backtrace_count = -1;
-            thread_backtrace_mutex.unlock();
-        }
-    #elif defined( USE_WINDOWS )
-        #if defined(DBGHELP)
-
-            // Load the modules for the stack trace
-            LoadModules();
-
-            // Initialize stackframe for first call
-            ::CONTEXT context;
-            memset( &context, 0, sizeof( context ) );
-            context.ContextFlags = CONTEXT_FULL;
-            RtlCaptureContext( &context );
-            STACKFRAME64 frame; // in/out stackframe
-            memset( &frame, 0, sizeof( frame ) );
-            #ifdef _M_IX86
-                DWORD imageType = IMAGE_FILE_MACHINE_I386;
-                frame.AddrPC.Offset    = context.Eip;
-                frame.AddrPC.Mode      = AddrModeFlat;
-                frame.AddrFrame.Offset = context.Ebp;
-                frame.AddrFrame.Mode   = AddrModeFlat;
-                frame.AddrStack.Offset = context.Esp;
-                frame.AddrStack.Mode   = AddrModeFlat;
-            #elif _M_X64
-                DWORD imageType = IMAGE_FILE_MACHINE_AMD64;
-                frame.AddrPC.Offset    = context.Rip;
-                frame.AddrPC.Mode      = AddrModeFlat;
-                frame.AddrFrame.Offset = context.Rsp;
-                frame.AddrFrame.Mode   = AddrModeFlat;
-                frame.AddrStack.Offset = context.Rsp;
-                frame.AddrStack.Mode   = AddrModeFlat;
-            #elif _M_IA64
-                DWORD imageType = IMAGE_FILE_MACHINE_IA64;
-                frame.AddrPC.Offset     = context.StIIP;
-                frame.AddrPC.Mode       = AddrModeFlat;
-                frame.AddrFrame.Offset  = context.IntSp;
-                frame.AddrFrame.Mode    = AddrModeFlat;
-                frame.AddrBStore.Offset = context.RsBSP;
-                frame.AddrBStore.Mode   = AddrModeFlat;
-                frame.AddrStack.Offset  = context.IntSp;
-                frame.AddrStack.Mode    = AddrModeFlat;
-            #else
-                #error "Platform not supported!"
-            #endif
-
-            auto pid = GetCurrentProcess();
-            for ( int frameNum = 0; frameNum<1024; ++frameNum ) {
-                BOOL rtn = StackWalk64( imageType, pid, tid, &frame, &context, readProcMem,
-                                        SymFunctionTableAccess, SymGetModuleBase64, NULL );
-                if ( !rtn ) {
-                    printf( "ERROR: StackWalk64 (%p)\n", frame.AddrPC.Offset );
-                    break;
-                }
-                if ( frame.AddrPC.Offset != 0 ) {
-                    buffer[count] = reinterpret_cast<void*>( frame.AddrPC.Offset ) );
-                    count++;
-                }
-                if ( frame.AddrReturn.Offset == 0 )
-                    break;
-            }
-            SetLastError( ERROR_SUCCESS );
-        #endif
-    #else
-        #warning Stack trace is not supported on this compiler/OS
-    #endif
-    return count;
-}
-std::vector<void*> StackTrace::backtrace( std::thread::native_handle_type tid )
-{
-    std::vector<void*> trace( 1000, nullptr );
-    size_t count = backtrace_thread( tid, trace.data(), trace.size() );
-    trace.resize(count);
-    return trace;
-}
-std::vector<void*> StackTrace::backtrace()
-{
-    std::vector<void*> trace( 1000, nullptr );
-    size_t count = backtrace_thread( thisThread(), trace.data(), trace.size() );
-    trace.resize(count);
-    return trace;
-}
-std::vector<std::vector<void *>> StackTrace::backtraceAll()
-{
-    // Get the list of threads
-    auto threads = activeThreads( );
-    // Get the backtrace of each thread
-    std::vector<std::vector<void*>> trace(threads.size());
-    size_t i = 0;
-    for ( auto it=threads.begin(); i<threads.size(); i++, it++ ) {
-        trace[i].resize(1000);
-        size_t count = backtrace_thread( *it, trace[i].data(), trace[i].size() );
-        trace[i].resize(count);
-    }
-    return trace;
-}
-
-
-/****************************************************************************
-*  Function to get the list of all active threads                           *
-****************************************************************************/
-#if defined( USE_LINUX )
-static std::thread::native_handle_type thread_handle;
-static bool thread_id_finished;
-static void _activeThreads_signal_handler( int )
-{
-    auto handle = StackTrace::thisThread( );
-    thread_handle = handle;
-    thread_id_finished = true;
-}
-static inline int get_tid( int pid, const std::string& line )
-{
-    char buf2[128]={0};
-    int i1 = 0;
-    while ( line[i1]==' ' && line[i1]!=0 ) { i1++; }
-    int i2 = i1;
-    while ( line[i2]!=' ' && line[i2]!=0 ) { i2++; }
-    memcpy(buf2,&line[i1],i2-i1);
-    buf2[i2-i1+1] = 0;
-    int pid2 = atoi(buf2);
-    if ( pid2 != pid )
-        return -1;
-    i1 = i2;
-    while ( line[i1]==' ' && line[i1]!=0 ) { i1++; }
-    i2 = i1;
-    while ( line[i2]!=' ' && line[i2]!=0 ) { i2++; }
-    memcpy(buf2,&line[i1],i2-i1);
-    buf2[i2-i1+1] = 0;
-    int tid = atoi(buf2);
-    return tid;
-}
-#endif
-std::thread::native_handle_type StackTrace::thisThread( )
-{
-    #if defined( USE_LINUX ) || defined( USE_MAC )
-        return pthread_self();
-    #elif defined( USE_WINDOWS )
-        return GetCurrentThread();
-    #else
-        #warning Stack trace is not supported on this compiler/OS
-        return std::thread::native_handle_type();
-    #endif
-}
-std::set<std::thread::native_handle_type> StackTrace::activeThreads( )
-{
-    std::set<std::thread::native_handle_type> threads;
-    #if defined( USE_LINUX )
-        std::set<int> tid;
-        int pid = getpid();
-        char cmd[128];
-        sprintf( cmd, "ps -T -p %i", pid );
-        signal( SIGCHLD, SIG_DFL );     // Clear child exited
-        int code;
-        auto output = breakString( exec( cmd, code ) );
-        for ( const auto& line : output ) {
-            int tid2 = get_tid( pid, line );
-            if ( tid2 != -1 )
-                tid.insert( tid2 );
-        }
-        tid.erase( syscall(SYS_gettid) );
-        signal( CALLSTACK_SIG, _activeThreads_signal_handler );
-        for ( auto tid2 : tid ) {
-            thread_backtrace_mutex.lock();
-            thread_id_finished = false;
-            thread_handle = thisThread();
-            syscall( SYS_tgkill, pid, tid2, CALLSTACK_SIG );
-            auto t1 = std::chrono::high_resolution_clock::now();
-            auto t2 = std::chrono::high_resolution_clock::now();
-            while ( !thread_id_finished && std::chrono::duration<double>(t2-t1).count()<0.1 ) {
-                std::this_thread::yield();
-                t2 = std::chrono::high_resolution_clock::now();
-            }
-            threads.insert( thread_handle );
-            thread_backtrace_mutex.unlock();
-        }
-    #elif defined( USE_MAC )
-        printf("activeThreads not finished\n");
-    #elif defined( USE_WINDOWS )
-        HANDLE hThreadSnap = CreateToolhelp32Snapshot( TH32CS_SNAPTHREAD, 0 ); 
-        if( hThreadSnap != INVALID_HANDLE_VALUE ) {
-            // Fill in the size of the structure before using it
-            THREADENTRY32 te32
-            te32.dwSize = sizeof(THREADENTRY32 );
-            // Retrieve information about the first thread, and exit if unsuccessful
-            if( !Thread32First( hThreadSnap, &te32 ) ) {
-                printError( TEXT("Thread32First") );    // Show cause of failure
-                CloseHandle( hThreadSnap );             // Must clean up the snapshot object!
-                return( FALSE );
-            }
-            // Now walk the thread list of the system
-            do { 
-                if ( te32.th32OwnerProcessID == dwOwnerPID )
-                    threads.insert( te32.th32ThreadID );
-            } while( Thread32Next(hThreadSnap, &te32 ) );
-            CloseHandle( hThreadSnap );                 // Must clean up the snapshot object!
-        }
-    #else
-        #warning activeThreads is not yet supported on this compiler/OS
-    #endif
-    threads.insert( thisThread() );
-    if ( globalMonitorThread )
-        threads.erase( globalMonitorThread->native_handle() );
-    return threads;
-}
-// clang-format on
-
-
-/****************************************************************************
- *  Function to get the current call stack                                   *
- ****************************************************************************/
-std::vector<StackTrace::stack_info> StackTrace::getCallStack()
-{
-    auto trace = StackTrace::backtrace();
-    auto info  = getStackInfo( trace );
-    return info;
-}
-std::vector<StackTrace::stack_info> StackTrace::getCallStack( std::thread::native_handle_type id )
-{
-    auto trace = StackTrace::backtrace( id );
-    auto info  = getStackInfo( trace );
-    return info;
-}
-static StackTrace::multi_stack_info
-generateMultiStack( const std::vector<std::vector<void *>> &thread_backtrace )
-{
-    // Get the stack data for all pointers
-    std::set<void *> addresses_set;
-    for ( const auto &trace : thread_backtrace ) {
-        for ( auto ptr : trace )
-            addresses_set.insert( ptr );
-    }
-    std::vector<void *> addresses( addresses_set.begin(), addresses_set.end() );
-    auto stack_data = StackTrace::getStackInfo( addresses );
-    std::map<void *, StackTrace::stack_info> map_data;
-    for ( size_t i = 0; i < addresses.size(); i++ )
-        map_data.insert( std::make_pair( addresses[i], stack_data[i] ) );
-    // Create the multi-stack trace
-    StackTrace::multi_stack_info multistack;
-    for ( const auto &trace : thread_backtrace ) {
-        if ( trace.empty() )
-            continue;
-        // Create the stack for the given thread trace
-        std::vector<StackTrace::stack_info> stack( trace.size() );
-        for ( size_t i = 0; i < trace.size(); i++ )
-            stack[i] = map_data[trace[i]];
-        // Add the data to the multistack
-        multistack.add( stack.size(), stack.data() );
-    }
-    return multistack;
-}
-StackTrace::multi_stack_info StackTrace::getAllCallStacks()
-{
-    // Get the backtrace of each thread
-    auto thread_backtrace = backtraceAll();
-    // Create the multi-stack strucutre
-    auto stack = generateMultiStack( thread_backtrace );
-    return stack;
-}
-
-
-/****************************************************************************
- *  Function to get system search paths                                      *
- ****************************************************************************/
-std::string StackTrace::getSymPaths()
-{
-    std::string paths;
-#ifdef USE_WINDOWS
-    // Create the path list (seperated by ';' )
-    paths = std::string( ".;" );
-    paths.reserve( 1000 );
-    // Add the current directory
-    paths += getCurrentDirectory() + ";";
-    // Now add the path for the main-module:
-    char temp[1024];
-    memset( temp, 0, sizeof( temp ) );
-    if ( GetModuleFileNameA( nullptr, temp, sizeof( temp ) - 1 ) > 0 ) {
-        for ( char *p = ( temp + strlen( temp ) - 1 ); p >= temp; --p ) {
-            // locate the rightmost path separator
-            if ( ( *p == '\\' ) || ( *p == '/' ) || ( *p == ':' ) ) {
-                *p = 0;
-                break;
-            }
-        }
-        if ( strlen( temp ) > 0 ) {
-            paths += temp;
-            paths += ";";
-        }
-    }
-    memset( temp, 0, sizeof( temp ) );
-    if ( GetEnvironmentVariableA( "_NT_SYMBOL_PATH", temp, sizeof( temp ) - 1 ) > 0 ) {
-        paths += temp;
-        paths += ";";
-    }
-    memset( temp, 0, sizeof( temp ) );
-    if ( GetEnvironmentVariableA( "_NT_ALTERNATE_SYMBOL_PATH", temp, sizeof( temp ) - 1 ) > 0 ) {
-        paths += temp;
-        paths += ";";
-    }
-    memset( temp, 0, sizeof( temp ) );
-    if ( GetEnvironmentVariableA( "SYSTEMROOT", temp, sizeof( temp ) - 1 ) > 0 ) {
-        paths += temp;
-        paths += ";";
-        // also add the "system32"-directory:
-        paths += temp;
-        paths += "\\system32;";
-    }
-    memset( temp, 0, sizeof( temp ) );
-    if ( GetEnvironmentVariableA( "SYSTEMDRIVE", temp, sizeof( temp ) - 1 ) > 0 ) {
-        paths += "SRV*;" + std::string( temp ) +
-                 "\\websymbols*http://msdl.microsoft.com/download/symbols;";
-    } else {
-        paths += "SRV*c:\\websymbols*http://msdl.microsoft.com/download/symbols;";
-    }
-#endif
-    return paths;
-}
-
-
-/****************************************************************************
- *  Load modules for windows                                                 *
- ****************************************************************************/
-#ifdef USE_WINDOWS
-BOOL StackTrace::GetModuleListTH32( HANDLE hProcess, DWORD pid )
-{
-    // CreateToolhelp32Snapshot()
-    typedef HANDLE( __stdcall * tCT32S )( DWORD dwFlags, DWORD th32ProcessID );
-    // Module32First()
-    typedef BOOL( __stdcall * tM32F )( HANDLE hSnapshot, LPMODULEENTRY32 lpme );
-    // Module32Next()
-    typedef BOOL( __stdcall * tM32N )( HANDLE hSnapshot, LPMODULEENTRY32 lpme );
-
-    // try both dlls...
-    const TCHAR *dllname[] = { _T("kernel32.dll"), _T("tlhelp32.dll") };
-    HINSTANCE hToolhelp    = nullptr;
-    tCT32S pCT32S          = nullptr;
-    tM32F pM32F            = nullptr;
-    tM32N pM32N            = nullptr;
-
-    HANDLE hSnap;
-    MODULEENTRY32 me;
-    me.dwSize = sizeof( me );
-
-    for ( size_t i = 0; i < ( sizeof( dllname ) / sizeof( dllname[0] ) ); i++ ) {
-        hToolhelp = LoadLibrary( dllname[i] );
-        if ( hToolhelp == nullptr )
-            continue;
-        pCT32S = (tCT32S) GetProcAddress( hToolhelp, "CreateToolhelp32Snapshot" );
-        pM32F  = (tM32F) GetProcAddress( hToolhelp, "Module32First" );
-        pM32N  = (tM32N) GetProcAddress( hToolhelp, "Module32Next" );
-        if ( ( pCT32S != nullptr ) && ( pM32F != nullptr ) && ( pM32N != nullptr ) )
-            break; // found the functions!
-        FreeLibrary( hToolhelp );
-        hToolhelp = nullptr;
-    }
-
-    if ( hToolhelp == nullptr )
-        return FALSE;
-
-    hSnap = pCT32S( TH32CS_SNAPMODULE, pid );
-    if ( hSnap == (HANDLE) -1 ) {
-        FreeLibrary( hToolhelp );
-        return FALSE;
-    }
-
-    bool keepGoing = !!pM32F( hSnap, &me );
-    int cnt        = 0;
-    while ( keepGoing ) {
-        LoadModule( hProcess, me.szExePath, me.szModule, (DWORD64) me.modBaseAddr, me.modBaseSize );
-        cnt++;
-        keepGoing = !!pM32N( hSnap, &me );
-    }
-    CloseHandle( hSnap );
-    FreeLibrary( hToolhelp );
-    if ( cnt <= 0 )
-        return FALSE;
-    return TRUE;
-}
-DWORD StackTrace::LoadModule(
-    HANDLE hProcess, LPCSTR img, LPCSTR mod, DWORD64 baseAddr, DWORD size )
-{
-    CHAR *szImg  = _strdup( img );
-    CHAR *szMod  = _strdup( mod );
-    DWORD result = ERROR_SUCCESS;
-    if ( ( szImg == nullptr ) || ( szMod == nullptr ) ) {
-        result = ERROR_NOT_ENOUGH_MEMORY;
-    } else {
-        if ( SymLoadModule( hProcess, 0, szImg, szMod, baseAddr, size ) == 0 )
-            result = GetLastError();
-    }
-    ULONGLONG fileVersion = 0;
-    if ( szImg != nullptr ) {
-        // try to retrive the file-version:
-        VS_FIXEDFILEINFO *fInfo = nullptr;
-        DWORD dwHandle;
-        DWORD dwSize = GetFileVersionInfoSizeA( szImg, &dwHandle );
-        if ( dwSize > 0 ) {
-            LPVOID vData = malloc( dwSize );
-            if ( vData != nullptr ) {
-                if ( GetFileVersionInfoA( szImg, dwHandle, dwSize, vData ) != 0 ) {
-                    UINT len;
-                    TCHAR szSubBlock[] = _T("\\");
-                    if ( VerQueryValue( vData, szSubBlock, (LPVOID *) &fInfo, &len ) == 0 ) {
-                        fInfo = nullptr;
-                    } else {
-                        fileVersion = ( (ULONGLONG) fInfo->dwFileVersionLS ) +
-                                      ( (ULONGLONG) fInfo->dwFileVersionMS << 32 );
-                    }
-                }
-                free( vData );
-            }
-        }
-
-        // Retrive some additional-infos about the module
-        IMAGEHLP_MODULE64 Module;
-        Module.SizeOfStruct = sizeof( IMAGEHLP_MODULE64 );
-        SymGetModuleInfo64( hProcess, baseAddr, &Module );
-        LPCSTR pdbName = Module.LoadedImageName;
-        if ( Module.LoadedPdbName[0] != 0 )
-            pdbName = Module.LoadedPdbName;
-    }
-    if ( szImg != nullptr )
-        free( szImg );
-    if ( szMod != nullptr )
-        free( szMod );
-    return result;
-}
-BOOL StackTrace::GetModuleListPSAPI( HANDLE hProcess )
-{
-    DWORD cbNeeded;
-    HMODULE hMods[1024];
-    char tt[8192];
-    char tt2[8192];
-    if ( !EnumProcessModules( hProcess, hMods, sizeof( hMods ), &cbNeeded ) ) {
-        return false;
-    }
-    if ( cbNeeded > sizeof( hMods ) ) {
-        printf( "Insufficient memory allocated in GetModuleListPSAPI\n" );
-        return false;
-    }
-    int cnt = 0;
-    for ( DWORD i = 0; i < cbNeeded / sizeof( hMods[0] ); i++ ) {
-        // base address, size
-        MODULEINFO mi;
-        GetModuleInformation( hProcess, hMods[i], &mi, sizeof( mi ) );
-        // image file name
-        tt[0] = 0;
-        GetModuleFileNameExA( hProcess, hMods[i], tt, sizeof( tt ) );
-        // module name
-        tt2[0] = 0;
-        GetModuleBaseNameA( hProcess, hMods[i], tt2, sizeof( tt2 ) );
-        DWORD dwRes = LoadModule( hProcess, tt, tt2, (DWORD64) mi.lpBaseOfDll, mi.SizeOfImage );
-        if ( dwRes != ERROR_SUCCESS )
-            printf( "ERROR: LoadModule (%d)\n", dwRes );
-        cnt++;
-    }
-
-    return cnt != 0;
-}
-void StackTrace::LoadModules()
-{
-    static bool modules_loaded = false;
-    if ( !modules_loaded ) {
-        modules_loaded = true;
-
-        // Get the search paths for symbols
-        std::string paths = StackTrace::getSymPaths();
-
-        // Initialize the symbols
-        if ( SymInitialize( GetCurrentProcess(), paths.c_str(), FALSE ) == FALSE )
-            printf( "ERROR: SymInitialize (%d)\n", GetLastError() );
-
-        DWORD symOptions = SymGetOptions();
-        symOptions |= SYMOPT_LOAD_LINES | SYMOPT_FAIL_CRITICAL_ERRORS;
-        symOptions     = SymSetOptions( symOptions );
-        char buf[1024] = { 0 };
-        if ( SymGetSearchPath( GetCurrentProcess(), buf, sizeof( buf ) ) == FALSE )
-            printf( "ERROR: SymGetSearchPath (%d)\n", GetLastError() );
-
-        // First try to load modules from toolhelp32
-        BOOL loaded = StackTrace::GetModuleListTH32( GetCurrentProcess(), GetCurrentProcessId() );
-
-        // Try to load from Psapi
-        if ( !loaded )
-            loaded = StackTrace::GetModuleListPSAPI( GetCurrentProcess() );
-    }
-}
-#endif
-
-
-/****************************************************************************
- *  Get the signal name                                                      *
- ****************************************************************************/
-std::string StackTrace::signalName( int sig ) { return std::string( strsignal( sig ) ); }
-std::vector<int> StackTrace::allSignalsToCatch()
-{
-    std::set<int> signals;
-    for ( int i = 1; i < 32; i++ )
-        signals.insert( i );
-    for ( int i = SIGRTMIN; i <= SIGRTMAX; i++ )
-        signals.insert( i );
-    signals.erase( SIGKILL );
-    signals.erase( SIGSTOP );
-    return std::vector<int>( signals.begin(), signals.end() );
-}
-std::vector<int> StackTrace::defaultSignalsToCatch()
-{
-    auto tmp = allSignalsToCatch();
-    std::set<int> signals( tmp.begin(), tmp.end() );
-    signals.erase( SIGWINCH ); // Don't catch window changed by default
-    signals.erase( SIGCONT );  // Don't catch continue by default
-    return std::vector<int>( signals.begin(), signals.end() );
-}
-
-
-/****************************************************************************
- *  Set the signal handlers                                                  *
- ****************************************************************************/
-static std::function<void( std::string, StackTrace::terminateType )> abort_fun;
-static std::string rethrow()
-{
-    std::string last_message;
-#ifdef USE_LINUX
-    try {
-        static int tried_throw = 0;
-        if ( tried_throw == 0 ) {
-            tried_throw = 1;
-            throw;
-        }
-        // No active exception
-    } catch ( const std::exception &err ) {
-        // Caught a std::runtime_error
-        last_message = err.what();
-    } catch ( ... ) {
-        // Caught an unknown exception
-        last_message = "unknown exception occurred.";
-    }
-#endif
-    return last_message;
-}
-static void term_func_abort( int sig )
-{
-    std::string msg( "Caught signal: " );
-    msg += StackTrace::signalName( sig );
-    abort_fun( msg, StackTrace::terminateType::signal );
-}
-static std::set<int> signals_set = std::set<int>();
-static void term_func()
-{
-    std::string last_message = rethrow();
-    StackTrace::clearSignals();
-    abort_fun( "Unhandled exception:\n" + last_message, StackTrace::terminateType::exception );
-}
-void StackTrace::clearSignal( int sig )
-{
-    if ( signals_set.find( sig ) != signals_set.end() ) {
-        signal( sig, SIG_DFL );
-        signals_set.erase( sig );
-    }
-}
-void StackTrace::clearSignals()
-{
-    for ( auto sig : signals_set )
-        signal( sig, SIG_DFL );
-    signals_set.clear();
-}
-void StackTrace::setSignals( const std::vector<int> &signals, void ( *handler )( int ) )
-{
-    for ( auto sig : signals ) {
-        signal( sig, handler );
-        signals_set.insert( sig );
-    }
-}
-void StackTrace::setErrorHandlers(
-    std::function<void( std::string, StackTrace::terminateType )> abort )
-{
-    abort_fun = abort;
-    std::set_terminate( term_func );
-    setSignals( defaultSignalsToCatch(), &term_func_abort );
-    std::set_unexpected( term_func );
-}
-
-
-/****************************************************************************
- *  Global call stack functionallity                                         *
- ****************************************************************************/
-#ifdef USE_MPI
-static MPI_Comm globalCommForGlobalCommStack = MPI_COMM_NULL;
-static bool stopGlobalMonitorThread          = false;
-static void runGlobalMonitorThread()
-{
-    int rank = 0;
-    int size = 1;
-    MPI_Comm_size( globalCommForGlobalCommStack, &size );
-    MPI_Comm_rank( globalCommForGlobalCommStack, &rank );
-    while ( !stopGlobalMonitorThread ) {
-        // Check for any messages
-        int flag = 0;
-        MPI_Status status;
-        int err = MPI_Iprobe( MPI_ANY_SOURCE, 1, globalCommForGlobalCommStack, &flag, &status );
-        if ( err != MPI_SUCCESS ) {
-            printf( "Internal error in StackTrace::getGlobalCallStacks::runGlobalMonitorThread\n" );
-            break;
-        } else if ( flag != 0 ) {
-            // We received a request
-            int src_rank = status.MPI_SOURCE;
-            int tag;
-            MPI_Recv( &tag, 1, MPI_INT, src_rank, 1, globalCommForGlobalCommStack, &status );
-            // Get a trace of all threads (except this)
-            auto threads = StackTrace::activeThreads();
-            threads.erase( StackTrace::thisThread() );
-            if ( threads.empty() )
-                continue;
-            // Get the stack trace of each thread
-            std::vector<std::vector<StackTrace::stack_info>> stack;
-            for ( auto thread : threads )
-                stack.push_back( StackTrace::getCallStack( thread ) );
-            // Pack and send the data
-            auto data = pack( stack );
-            int count = data.size();
-            MPI_Send( data.data(), count, MPI_CHAR, src_rank, tag, globalCommForGlobalCommStack );
-        } else {
-            // No requests recieved
-            std::this_thread::sleep_for( std::chrono::milliseconds( 50 ) );
-        }
-    }
-}
-void StackTrace::globalCallStackInitialize( MPI_Comm comm )
-{
-#ifdef USE_MPI
-    MPI_Comm_dup( comm, &globalCommForGlobalCommStack );
-#endif
-    stopGlobalMonitorThread = false;
-    globalMonitorThread.reset( new std::thread( runGlobalMonitorThread ) );
-}
-void StackTrace::globalCallStackFinalize()
-{
-    stopGlobalMonitorThread = true;
-    globalMonitorThread->join();
-    globalMonitorThread.reset();
-#ifdef USE_MPI
-    if ( globalCommForGlobalCommStack != MPI_COMM_NULL )
-        MPI_Comm_free( &globalCommForGlobalCommStack );
-    globalCommForGlobalCommStack = MPI_COMM_NULL;
-#endif
-}
-StackTrace::multi_stack_info StackTrace::getGlobalCallStacks()
-{
-    // Check if we properly initialized the comm
-    if ( globalMonitorThread == nullptr ) {
-        printf( "Warning: getGlobalCallStacks called without call to globalCallStackInitialize\n" );
-        return getAllCallStacks();
-    }
-    if ( globalMonitorThread == nullptr ) {
-        printf( "Warning: getGlobalCallStacks called without call to globalCallStackInitialize\n" );
-        return getAllCallStacks();
-    }
-#ifdef USE_MPI
-    int provided;
-    MPI_Query_thread( &provided );
-    if ( provided != MPI_THREAD_MULTIPLE ) {
-        printf( "Warning: getGlobalCallStacks requires support for MPI_THREAD_MULTIPLE\n" );
-        return getAllCallStacks();
-    }
-#endif
-    if ( activeThreads().size() == 1 ) {
-        printf( "Warning: getAllCallStacks not supported on this OS, defaulting to basic call "
-                "stack\n" );
-        return getAllCallStacks();
-    }
-    // Signal all processes that we want their stack for all threads
-    int rank = 0;
-    int size = 1;
-    MPI_Comm_size( globalCommForGlobalCommStack, &size );
-    MPI_Comm_rank( globalCommForGlobalCommStack, &rank );
-    std::random_device rd;
-    std::mt19937 gen( rd() );
-    std::uniform_int_distribution<> dis( 2, 0x7FFF );
-    int tag = dis( gen );
-    std::vector<MPI_Request> sendRequest( size );
-    for ( int i = 0; i < size; i++ ) {
-        if ( i == rank )
-            continue;
-        MPI_Isend( &tag, 1, MPI_INT, i, 1, globalCommForGlobalCommStack, &sendRequest[i] );
-    }
-    // Get the trace for the current process
-    auto threads = StackTrace::activeThreads();
-    StackTrace::multi_stack_info multistack;
-    for ( auto thread : threads ) {
-        auto stack = StackTrace::getCallStack( thread );
-        multistack.add( stack.size(), stack.data() );
-    }
-    // Recieve the backtrace for all processes/threads
-    int N_finished        = 1;
-    auto start            = std::chrono::steady_clock::now();
-    double time           = 0;
-    const double max_time = 2.0 + size * 20e-3;
-    while ( N_finished < size && time < max_time ) {
-        int flag = 0;
-        MPI_Status status;
-        int err = MPI_Iprobe( MPI_ANY_SOURCE, tag, globalCommForGlobalCommStack, &flag, &status );
-        if ( err != MPI_SUCCESS ) {
-            printf( "Internal error in StackTrace::getGlobalCallStacks\n" );
-            break;
-        } else if ( flag != 0 ) {
-            // We recieved a response
-            int src_rank = status.MPI_SOURCE;
-            int count;
-            MPI_Get_count( &status, MPI_CHAR, &count );
-            std::vector<char> data( count, 0 );
-            MPI_Recv( data.data(),
-                      count,
-                      MPI_CHAR,
-                      src_rank,
-                      tag,
-                      globalCommForGlobalCommStack,
-                      &status );
-            auto stack_list = unpack( data );
-            for ( const auto &stack : stack_list )
-                multistack.add( stack.size(), stack.data() );
-            N_finished++;
-        } else {
-            auto stop = std::chrono::steady_clock::now();
-            time      = std::chrono::duration_cast<std::chrono::seconds>( stop - start ).count();
-            std::this_thread::yield();
-        }
-    }
-    for ( int i = 0; i < size; i++ ) {
-        if ( i == rank )
-            continue;
-        MPI_Request_free( &sendRequest[i] );
-    }
-    return multistack;
-}
-#else
-void StackTrace::globalCallStackInitialize( MPI_Comm ) {}
-void StackTrace::globalCallStackFinalize() {}
-StackTrace::multi_stack_info StackTrace::getGlobalCallStacks() { return getAllCallStacks(); }
-#endif
-
-
-/****************************************************************************
- *  Cleanup the call stack                                                   *
- ****************************************************************************/
-static inline size_t findMatching( const std::string &str, size_t pos )
-{
-    if ( str[pos] != '<' ) {
-        perr << "Internal error string matching\n";
-        perr << "   " << str << std::endl;
-        perr << "   " << pos << std::endl;
-        return pos;
-    }
-    size_t pos2 = pos + 1;
-    int count   = 1;
-    while ( count != 0 && pos2 < str.size() ) {
-        if ( str[pos2] == '<' )
-            count++;
-        if ( str[pos2] == '>' )
-            count--;
-        pos2++;
-    }
-    return pos2;
-}
-void StackTrace::cleanupStackTrace( multi_stack_info &stack )
-{
-    auto it           = stack.children.begin();
-    const size_t npos = std::string::npos;
-    while ( it != stack.children.end() ) {
-        auto &object      = it->stack.object;
-        auto &function    = it->stack.function;
-        auto &filename    = it->stack.filename;
-        bool remove_entry = false;
-        // Cleanup object and filename
-        object   = stripPath( object );
-        filename = stripPath( filename );
-        // Remove callstack (and all children) for threads that are just contributing
-        if ( function.find( "_callstack_signal_handler" ) != npos &&
-             filename.find( "StackTrace.cpp" ) != npos ) {
-            it = stack.children.erase( it );
-            continue;
-        }
-        // Remove __libc_start_main
-        if ( function.find( "__libc_start_main" ) != npos &&
-             filename.find( "libc-start.c" ) != npos )
-            remove_entry = true;
-        // Remove backtrace_thread
-        if ( function.find( "backtrace_thread" ) != npos &&
-             filename.find( "StackTrace.cpp" ) != npos )
-            remove_entry = true;
-        // Remove __restore_rt
-        if ( function.find( "__restore_rt" ) != npos && object.find( "libpthread" ) != npos )
-            remove_entry = true;
-        // Remove std::condition_variable::__wait_until_impl
-        if ( function.find( "std::condition_variable::__wait_until_impl" ) != npos &&
-             filename == "condition_variable" )
-            remove_entry = true;
-        // Remove std::_Function_handler<
-        if ( function.find( "std::_Function_handler<" ) != npos && filename == "functional" )
-            remove_entry = true;
-        // Remove std::_Bind_simple<
-        if ( function.find( "std::_Bind_simple<" ) != npos && filename == "functional" ) {
-            auto pos     = function.find( "std::_Bind_simple<" );
-            function     = function.substr( 0, pos ) + "std::_Bind_simple<...>(...)";
-            remove_entry = true;
-        }
-        // Remove std::this_thread::__sleep_for
-        if ( function.find( "std::this_thread::__sleep_for(" ) != npos &&
-             object.find( "libstdc++" ) != npos )
-            remove_entry = true;
-        // Remove std::thread::_Impl
-        if ( function.find( "std::thread::_Impl<" ) != npos && filename == "thread" )
-            remove_entry = true;
-        // Remove MATLAB internal routines
-        if ( object == "libmwmcr.so" || object == "libmwm_lxe.so" || object == "libmwbridge.so" ||
-             object == "libmwiqm.so" )
-            remove_entry = true;
-        // Remove the desired entry
-        if ( remove_entry ) {
-            if ( it->children.empty() ) {
-                it = stack.children.erase( it );
-                continue;
-            } else if ( it->children.size() == 1 ) {
-                *it = it->children[0];
-                continue;
-            }
-        }
-        // Cleanup template space
-        strrep( function, " >", ">" );
-        strrep( function, "< ", "<" );
-        // Replace std::chrono::duration with abbriviated version
-        if ( function.find( "std::chrono::duration<" ) != npos ) {
-            strrep( function, "std::chrono::duration<long, std::ratio<1l, 1l> >", "ticks" );
-            strrep( function,
-                    "std::chrono::duration<long, std::ratio<1l, 1000000000l> >",
-                    "nanoseconds" );
-        }
-        // Replace std::ratio with abbriviated version.
-        if ( function.find( "std::ratio<" ) != npos ) {
-            strrep( function, "std::ratio<1l, 1000000000000000000000000l>", "std::yocto" );
-            strrep( function, "std::ratio<1l, 1000000000000000000000l>", "std::zepto" );
-            strrep( function, "std::ratio<1l, 1000000000000000000l>", "std::atto" );
-            strrep( function, "std::ratio<1l, 1000000000000000l>", "std::femto" );
-            strrep( function, "std::ratio<1l, 1000000000000l>", "std::pico" );
-            strrep( function, "std::ratio<1l, 1000000000l>", "std::nano" );
-            strrep( function, "std::ratio<1l, 1000000l>", "std::micro" );
-            strrep( function, "std::ratio<1l, 1000l>", "std::milli" );
-            strrep( function, "std::ratio<1l, 100l>", "std::centi" );
-            strrep( function, "std::ratio<1l, 10l>", "std::deci" );
-            strrep( function, "std::ratio<1l, 1l>", "" );
-            strrep( function, "std::ratio<10l, 1l>", "std::deca" );
-            strrep( function, "std::ratio<60l, 1l>", "std::ratio<60>" );
-            strrep( function, "std::ratio<100l, 1l>", "std::hecto" );
-            strrep( function, "std::ratio<1000l, 1l>", "std::kilo" );
-            strrep( function, "std::ratio<3600l, 1l>", "std::ratio<3600>" );
-            strrep( function, "std::ratio<1000000l, 1l>", "std::mega" );
-            strrep( function, "std::ratio<1000000000l, 1l>", "std::giga" );
-            strrep( function, "std::ratio<1000000000000l, 1l>", "std::tera" );
-            strrep( function, "std::ratio<1000000000000000l, 1l>", "std::peta" );
-            strrep( function, "std::ratio<1000000000000000000l, 1l>", "std::exa" );
-            strrep( function, "std::ratio<1000000000000000000000l, 1l>", "std::zetta" );
-            strrep( function, "std::ratio<1000000000000000000000000l, 1l>", "std::yotta" );
-            strrep( function, " >", ">" );
-            strrep( function, "< ", "<" );
-        }
-        // Replace std::chrono::duration with abbriviated version.
-        if ( function.find( "std::chrono::duration<" ) != npos ) {
-            // clang-format off
-            strrep( function, "std::chrono::duration<long, std::nano>", "std::chrono::nanoseconds" );
-            strrep( function, "std::chrono::duration<long, std::micro>", "std::chrono::microseconds" );
-            strrep( function, "std::chrono::duration<long, std::milli>", "std::chrono::milliseconds" );
-            strrep( function, "std::chrono::duration<long>", "std::chrono::seconds" );
-            strrep( function, "std::chrono::duration<long,>", "std::chrono::seconds" );
-            strrep( function, "std::chrono::duration<long, std::ratio<60>>", "std::chrono::minutes" );
-            strrep( function, "std::chrono::duration<long, std::ratio<3600>>", "std::chrono::hours" );
-            strrep( function, " >", ">" );
-            strrep( function, "< ", "<" );
-            // clang-format on
-        }
-        // Replace std::this_thread::sleep_for with abbriviated version.
-        if ( function.find( "::sleep_for<" ) != npos ) {
-            strrep( function, "::sleep_for<long, std::nano>", "::sleep_for<nanoseconds>" );
-            strrep( function, "::sleep_for<long, std::micro>", "::sleep_for<microseconds>" );
-            strrep( function, "::sleep_for<long, std::milli>", "::sleep_for<milliseconds>" );
-            strrep( function, "::sleep_for<long>", "::sleep_for<seconds>" );
-            strrep( function, "::sleep_for<long,>", "::sleep_for<seconds>" );
-            strrep( function, "::sleep_for<long, std::ratio<60>>", "::sleep_for<minutes>" );
-            strrep( function, "::sleep_for<long, std::ratio<3600>>", "::sleep_for<hours>" );
-            strrep( function,
-                    "::sleep_for<nanoseconds>(std::chrono::nanoseconds",
-                    "::sleep_for(std::chrono::nanoseconds" );
-            strrep( function,
-                    "::sleep_for<microseconds>(std::chrono::microseconds",
-                    "::sleep_for(std::chrono::microseconds" );
-            strrep( function,
-                    "::sleep_for<milliseconds>(std::chrono::milliseconds",
-                    "::sleep_for(std::chrono::milliseconds" );
-            strrep( function,
-                    "::sleep_for<seconds>(std::chrono::seconds",
-                    "::sleep_for(std::chrono::seconds" );
-            strrep( function,
-                    "::sleep_for<milliseconds>(std::chrono::minutes",
-                    "::sleep_for(std::chrono::milliseconds" );
-            strrep( function,
-                    "::sleep_for<milliseconds>(std::chrono::hours",
-                    "::sleep_for(std::chrono::hours" );
-        }
-        // Replace std::basic_string with abbriviated version
-        size_t pos = 0;
-        while ( pos < function.size() ) {
-            // Find next instance of std::basic_string
-            const std::string match = "std::basic_string<";
-            pos                     = function.find( match, pos );
-            if ( pos == npos )
-                break;
-            // Find the matching >
-            size_t pos1 = pos + match.size() - 1;
-            size_t pos2 = findMatching( function, pos1 );
-            if ( pos2 == pos1 )
-                break;
-            if ( function.substr( pos1 + 1, 4 ) == "char" )
-                function.replace( pos, pos2 - pos, "std::string" );
-            else if ( function.substr( pos1 + 1, 7 ) == "wchar_t" )
-                function.replace( pos, pos2 - pos, "std::wstring" );
-            else if ( function.substr( pos1 + 1, 8 ) == "char16_t" )
-                function.replace( pos, pos2 - pos, "std::u16string" );
-            else if ( function.substr( pos1 + 1, 8 ) == "char32_t" )
-                function.replace( pos, pos2 - pos, "std::u32string" );
-            pos++;
-        }
-        // Cleanup the children
-        cleanupStackTrace( *it );
-        ++it;
-    }
-}
diff --git a/common/Utilities.cpp b/common/Utilities.cpp
index d34385a2..f6d810af 100644
--- a/common/Utilities.cpp
+++ b/common/Utilities.cpp
@@ -1,303 +1,8 @@
 #include "common/Utilities.h"
-#include "common/StackTrace.h"
 
-#include <iostream>
-#include <sstream>
-#include <stdexcept>
-#include <fstream>
-#include <string.h>
-#include <signal.h>
 #include <math.h>
 #include <algorithm>
 
-#ifdef USE_MPI
-    #include "mpi.h"
-#endif
-
-// Detect the OS and include system dependent headers
-#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) || defined(_MSC_VER)
-    // Note: windows has not been testeds
-    #define USE_WINDOWS
-    #include <windows.h>
-    #include <process.h>
-    #include <stdio.h>   
-    #include <tchar.h>
-    #include <psapi.h>
-    #include <DbgHelp.h>
-    #define mkdir(path, mode) _mkdir(path)
-    //#pragma comment(lib, psapi.lib) //added
-    //#pragma comment(linker, /DEFAULTLIB:psapi.lib)
-#elif defined(__APPLE__)
-    #define USE_MAC
-    #include <sys/time.h>
-    #include <signal.h>
-    #include <execinfo.h>
-    #include <dlfcn.h>
-    #include <mach/mach.h>
-    #include <unistd.h>
-#elif defined( __linux ) || defined( __linux__ ) || defined( __unix ) || defined( __posix )
-    #define USE_LINUX
-    #include <sys/time.h>
-    #include <execinfo.h>
-    #include <dlfcn.h>
-    #include <malloc.h>
-    #include <unistd.h>
-#else
-    #error Unknown OS
-#endif
-
-
-#ifdef __GNUC__
-    #define USE_ABI
-    #include <cxxabi.h>
-#endif
-
-
-/****************************************************************************
-*  Function to terminate the program                                        *
-****************************************************************************/
-static bool abort_printMemory = true;
-static bool abort_printStack = true;
-static bool abort_throwException = false;
-static int force_exit = 0;
-void Utilities::setAbortBehavior( bool printMemory, bool printStack, bool throwException )
-{
-    abort_printMemory = printMemory;
-    abort_printStack = printStack;
-    abort_throwException = throwException;
-}
-void Utilities::abort(const std::string &message, const std::string &filename, const int line) 
-{
-    std::stringstream msg;
-    msg << "Program abort called in file `" << filename << "' at line " << line << std::endl;
-    // Add the memory usage and call stack to the error message
-    if ( abort_printMemory ) {
-        size_t N_bytes = Utilities::getMemoryUsage();
-        msg << "Bytes used = " << N_bytes << std::endl;
-    }
-    if ( abort_printStack ) {
-        std::vector<StackTrace::stack_info> stack = StackTrace::getCallStack();
-        msg << std::endl;
-        msg << "Stack Trace:\n";
-        for (size_t i=0; i<stack.size(); i++)
-            msg << "   " << stack[i].print() << std::endl;
-    }
-    msg << std::endl << message << std::endl;
-    // Print the message and abort
-    if ( force_exit>1 ) {
-        exit(-1);
-    } else if ( !abort_throwException ) {
-        // Use MPI_abort (will terminate all processes)
-        force_exit = 2;
-        std::cerr << msg.str();
-        #if defined(USE_MPI) || defined(HAVE_MPI)
-            int initialized=0, finalized=0;
-            MPI_Initialized(&initialized);
-            MPI_Finalized(&finalized);
-            if ( initialized!=0 && finalized==0 )
-                MPI_Abort(MPI_COMM_WORLD,-1);
-        #endif
-        exit(-1);
-    } else if ( force_exit>0 ) {
-        exit(-1);
-    } else {
-        // Throw and standard exception (allows the use of try, catch)
-        throw std::logic_error(msg.str());
-    }
-}
-
-
-/****************************************************************************
-*  Function to handle MPI errors                                            *
-****************************************************************************/
-/*#if defined(USE_MPI) || defined(HAVE_MPI)
-MPI_Errhandler mpierr;
-void MPI_error_handler_fun( MPI_Comm *comm, int *err, ... )
-{
-    if ( *err==MPI_ERR_COMM && *comm==MPI_COMM_WORLD ) {
-        // Special error handling for an invalid MPI_COMM_WORLD
-        std::cerr << "Error invalid MPI_COMM_WORLD";
-        exit(-1);
-    }
-    int msg_len=0;
-    char message[1000];
-    MPI_Error_string( *err, message, &msg_len );
-    if ( msg_len <= 0 )
-         abort("Unkown error in MPI");
-    abort( "Error calling MPI routine:\n" + std::string(message) );
-}
-#endif*/
-
-
-/****************************************************************************
-*  Function to handle unhandled exceptions                                  *
-****************************************************************************/
-bool tried_MPI_Abort=false;
-void term_func_abort(int err) 
-{
-    printf("Exiting due to abort (%i)\n",err);
-    std::vector<StackTrace::stack_info> stack = StackTrace::getCallStack();
-    std::string message = "Stack Trace:\n";
-    for (size_t i=0; i<stack.size(); i++)
-        message += "   " + stack[i].print() += "\n";
-    message += "\nExiting\n";
-    // Print the message and abort
-    std::cerr << message;
-    #ifdef USE_MPI
-        if ( !abort_throwException && !tried_MPI_Abort ) {
-            tried_MPI_Abort = true;
-            MPI_Abort(MPI_COMM_WORLD,-1);
-        }
-    #endif
-    exit(-1);
-}
-#if defined(USE_LINUX) || defined(USE_MAC)
-    static int tried_throw = 0;
-#endif
-void term_func() 
-{
-    // Try to re-throw the last error to get the last message
-    std::string last_message;
-    #if defined(USE_LINUX) || defined(USE_MAC)
-        try {
-            if ( tried_throw==0 ) { 
-                tried_throw = 1;
-                throw;
-            }
-            // No active exception
-        } catch (const std::exception &err) {
-            // Caught a std::runtime_error
-            last_message = err.what();
-        } catch (...) {
-            // Caught an unknown exception
-            last_message = "unknown exception occurred.";
-        }
-    #endif
-    std::stringstream msg;
-    msg << "Unhandled exception:" << std::endl;
-    msg << "   " << last_message << std::endl;
-    Utilities::abort( msg.str(), __FILE__, __LINE__ );
-}
-
-
-/****************************************************************************
-*  Functions to set the error handler                                       *
-****************************************************************************/
-static void setTerminateErrorHandler()
-{
-    std::set_terminate( term_func );
-    signal(SIGABRT,&term_func_abort);
-    signal(SIGFPE,&term_func_abort);
-    signal(SIGILL,&term_func_abort);
-    signal(SIGINT,&term_func_abort);
-    signal(SIGSEGV,&term_func_abort);
-    signal(SIGTERM,&term_func_abort);
-}
-void Utilities::setErrorHandlers()
-{
-    //d_use_MPI_Abort = use_MPI_Abort;
-    //setMPIErrorHandler( SAMRAI::tbox::SAMRAI_MPI::getSAMRAIWorld() );
-    setTerminateErrorHandler();
-}
-/*void Utilities::setMPIErrorHandler( const SAMRAI::tbox::SAMRAI_MPI& mpi )
-{
-    #if defined(USE_MPI) || defined(HAVE_MPI)
-        if ( mpierr.get()==NULL ) {
-            mpierr = boost::shared_ptr<MPI_Errhandler>( new MPI_Errhandler );
-            MPI_Comm_create_errhandler( MPI_error_handler_fun, mpierr.get() );
-        }
-        MPI_Comm_set_errhandler( mpi.getCommunicator(), *mpierr );
-        MPI_Comm_set_errhandler( MPI_COMM_WORLD, *mpierr );
-    #endif
-}
-void Utilities::clearMPIErrorHandler(  )
-{
-    #if defined(USE_MPI) || defined(HAVE_MPI)
-        if ( mpierr.get()!=NULL )
-            MPI_Errhandler_free( mpierr.get() );    // Delete the error handler
-        mpierr.reset();
-        MPI_Comm_set_errhandler( MPI_COMM_SELF, MPI_ERRORS_ARE_FATAL );
-        MPI_Comm_set_errhandler( MPI_COMM_WORLD, MPI_ERRORS_ARE_FATAL );
-    #endif
-}*/
-
-
-/****************************************************************************
-*  Function to get the memory usage                                         *
-*  Note: this function should be thread-safe                                *
-****************************************************************************/
-#if defined(USE_MAC)
-    // Get the page size on mac
-    static size_t page_size = static_cast<size_t>(sysconf(_SC_PAGESIZE));
-#endif
-static size_t N_bytes_initialization = Utilities::getMemoryUsage();
-size_t Utilities::getMemoryUsage()
-{
-    size_t N_bytes = 0;
-    #if defined(USE_LINUX)
-        struct mallinfo meminfo = mallinfo();
-        size_t size_hblkhd = static_cast<unsigned int>( meminfo.hblkhd );
-        size_t size_uordblks = static_cast<unsigned int>( meminfo.uordblks );
-        N_bytes = size_hblkhd + size_uordblks;
-    #elif defined(USE_MAC)
-        struct task_basic_info t_info;
-        mach_msg_type_number_t t_info_count = TASK_BASIC_INFO_COUNT;
-        if (KERN_SUCCESS != task_info(mach_task_self(),
-                              TASK_BASIC_INFO, (task_info_t)&t_info, 
-                              &t_info_count)) {
-            return 0;
-        }
-        N_bytes = t_info.virtual_size;
-    #elif defined(USE_WINDOWS)
-        PROCESS_MEMORY_COUNTERS memCounter;
-        GetProcessMemoryInfo( GetCurrentProcess(), &memCounter, sizeof(memCounter) );
-        N_bytes = memCounter.WorkingSetSize;
-    #endif
-    return N_bytes;
-}
-
-
-/****************************************************************************
-*  Functions to get the time and timer resolution                           *
-****************************************************************************/
-#if defined(USE_WINDOWS)
-    double Utilities::time() 
-    { 
-        LARGE_INTEGER end, f;
-        QueryPerformanceFrequency(&f);
-        QueryPerformanceCounter(&end);       
-        double time = ((double)end.QuadPart)/((double)f.QuadPart);
-        return time;
-    }
-    double Utilities::tick() 
-    { 
-        LARGE_INTEGER f;
-        QueryPerformanceFrequency(&f);
-        double resolution = ((double)1.0)/((double)f.QuadPart);
-        return resolution;
-    }
-#elif defined(USE_LINUX) || defined(USE_MAC)
-    double Utilities::time() 
-    { 
-        timeval current_time;
-        gettimeofday(&current_time,NULL);
-        double time = ((double)current_time.tv_sec)+1e-6*((double)current_time.tv_usec);
-        return time;
-    }
-    double Utilities::tick() 
-    { 
-        timeval start, end;
-        gettimeofday(&start,NULL);
-        gettimeofday(&end,NULL);
-        while ( end.tv_sec==start.tv_sec &&  end.tv_usec==start.tv_usec )
-            gettimeofday(&end,NULL);
-        double resolution = ((double)(end.tv_sec-start.tv_sec))+1e-6*((double)(end.tv_usec-start.tv_usec));
-        return resolution;
-    }
-#else
-    #error Unknown OS
-#endif
-
 
 // Factor a number into it's prime factors
 std::vector<int> Utilities::factor(size_t number)
diff --git a/common/Utilities.h b/common/Utilities.h
index e6db4279..90cb4008 100644
--- a/common/Utilities.h
+++ b/common/Utilities.h
@@ -1,91 +1,42 @@
 #ifndef included_Utilities
 #define included_Utilities
 
-#include <chrono>
 #include <cstdarg>
-#include <iostream>
-#include <mutex>
-#include <stdio.h>
-#include <stdlib.h>
-#include <sys/stat.h>
-#include <thread>
 #include <vector>
 
+#include "StackTrace/Utilities.h"
+
 
 namespace Utilities {
 
 
-/*!
- * Aborts the run after printing an error message with file and
- * linenumber information.
- */
-void abort( const std::string &message, const std::string &filename, const int line );
-
-
-/*!
- * Set the behavior of abort
- * @param printMemory       Print the current memory usage (default is true)
- * @param printStack        Print the current call stack (default is true)
- * @param throwException    Throw an exception instead of MPI_Abort (default is false)
- */
-void setAbortBehavior( bool printMemory, bool printStack, bool throwException );
-
-//! Function to set the error handlers
-void setErrorHandlers();
-
-
-/*!
- * Function to get the memory availible.
- * This function will return the total memory availible
- * Note: depending on the implimentation, this number may be rounded to
- * to a multiple of the page size.
- * If this function fails, it will return 0.
- */
-size_t getSystemMemory();
-
-
-/*!
- * Function to get the memory usage.
- * This function will return the total memory used by the application.
- * Note: depending on the implimentation, this number may be rounded to
- * to a multiple of the page size.
- * If this function fails, it will return 0.
- */
-size_t getMemoryUsage();
-
-
-//! Function to get an arbitrary point in time
-double time();
-
-
-//! Function to get the resolution of time
-double tick();
+// Functions inherited from StackTrace::Utilities
+using StackTrace::Utilities::abort;
+using StackTrace::Utilities::cause_segfault;
+using StackTrace::Utilities::clearErrorHandlers;
+using StackTrace::Utilities::exec;
+using StackTrace::Utilities::getMemoryUsage;
+using StackTrace::Utilities::getSystemMemory;
+using StackTrace::Utilities::setAbortBehavior;
+using StackTrace::Utilities::setErrorHandlers;
+using StackTrace::Utilities::tick;
+using StackTrace::Utilities::time;
+using StackTrace::Utilities::sleep_ms;
+using StackTrace::Utilities::sleep_s;
 
 
 //! std::string version of sprintf
 inline std::string stringf( const char *format, ... );
 
 
-/*!
- * Sleep for X ms
- * @param N         Time to sleep (ms)
- */
-inline void sleep_ms( int N ) { std::this_thread::sleep_for( std::chrono::milliseconds( N ) ); }
-
-
-/*!
- * Sleep for X s
- * @param N         Time to sleep (s)
- */
-inline void sleep_s( int N ) { std::this_thread::sleep_for( std::chrono::seconds( N ) ); }
-
-
 //! Factor a number into it's prime factors
 std::vector<int> factor(size_t number);
 
-//! Print AMP Banner
+
+//! Null use function
 void nullUse( void* );
 
+
 } // namespace Utilities
 
 
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index af648c76..c6775e68 100755
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -64,7 +64,6 @@ ADD_LBPM_TEST_1_2_4( TestBlobIdentify )
 ADD_LBPM_TEST_PARALLEL( TestSegDist 8 )
 ADD_LBPM_TEST_PARALLEL( TestCommD3Q19 8 )
 ADD_LBPM_TEST_1_2_4( testCommunication )
-ADD_LBPM_TEST_1_2_4( testUtilities )
 ADD_LBPM_TEST( TestWriter )
 IF ( USE_NETCDF )
     ADD_LBPM_TEST_PARALLEL( TestNetcdf 8 )
diff --git a/tests/TestWriter.cpp b/tests/TestWriter.cpp
index 855f33f6..78dab50b 100644
--- a/tests/TestWriter.cpp
+++ b/tests/TestWriter.cpp
@@ -4,8 +4,8 @@
 #include <exception>
 #include <stdexcept>
 #include <fstream>
+#include <memory>
 
-#include "shared_ptr.h"
 #include "common/UnitTest.h"
 #include "common/Utilities.h"
 #include "common/MPI_Helpers.h"
diff --git a/tests/testUtilities.cpp b/tests/testUtilities.cpp
deleted file mode 100644
index b084a695..00000000
--- a/tests/testUtilities.cpp
+++ /dev/null
@@ -1,145 +0,0 @@
-#include <iostream>
-#include <vector>
-#include <stdio.h>
-#include <stdlib.h>
-#include <time.h>
-#include <sys/stat.h>
-#include <math.h>
-#include <stdexcept>
-#include <string.h>
-#include <stdint.h>
-
-#include "common/Utilities.h"
-#include "common/StackTrace.h"
-#include "common/UnitTest.h"
-#include "common/MPI_Helpers.h"
-
-
-// Detect the OS (defines which tests we allow to fail)
-#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) || defined(_MSC_VER)
-    #define USE_WINDOWS
-#elif defined(__APPLE__)
-    #define USE_MAC
-#elif defined( __linux ) || defined( __linux__ ) || defined( __unix ) || defined( __posix )
-    #define USE_LINUX
-#else
-    #error Unknown OS
-#endif
-
-
-// Function to return the call stack
-std::vector<std::string> get_call_stack() 
-{
-    std::vector<StackTrace::stack_info> stack = StackTrace::getCallStack();
-    std::vector<std::string> stack2(stack.size());
-    for (size_t i=0; i<stack.size(); i++)
-        stack2[i] = stack[i].print();
-    // Trick compiler to skip inline for this function with fake recursion
-    if ( stack.size() > 10000 ) { stack2 = get_call_stack(); } 
-    return stack2;
-}
-
-
-// The main function
-int main(int argc, char *argv[]) 
-{
-    int rank = 0;
-    MPI_Init(&argc,&argv);
-    MPI_Comm comm = MPI_COMM_WORLD;
-    MPI_Comm_rank(comm,&rank);
-    UnitTest ut;
-    Utilities::setAbortBehavior( true, true, true );
-
-    // Limit the scope of variables
-    { 
-        // Test the memory usage
-        double t0 = Utilities::time();
-        size_t n_bytes1 = Utilities::getMemoryUsage();
-        double time1 = Utilities::time() - t0;
-        uint64_t *tmp = new uint64_t[0x100000];
-        memset(tmp,0xAA,0x100000*sizeof(uint64_t));
-        Utilities::nullUse( tmp );
-        t0 = Utilities::time();
-        size_t n_bytes2 = Utilities::getMemoryUsage();
-        double time2 = Utilities::time() - t0;
-        delete [] tmp;
-        t0 = Utilities::time();
-        size_t n_bytes3 = Utilities::getMemoryUsage();
-        double time3 = Utilities::time() - t0;
-        std::cout << "Number of bytes used for a basic test: " << n_bytes1 << ", " << n_bytes2 << ", " << n_bytes3 << std::endl;
-        std::cout << "   Time to query: " << time1*1e6 << " us, " << time2*1e6 << " us, " << time3*1e6 << " us" << std::endl;
-        if ( n_bytes1==0 ) {
-            ut.failure("getMemoryUsage returns 0");
-        } else {
-            ut.passes("getMemoryUsage returns non-zero");
-            if ( n_bytes2>n_bytes1 ) {
-                ut.passes("getMemoryUsage increases size");
-            } else {
-                #if defined(USE_MAC)
-                    ut.expected_failure("getMemoryUsage does not increase size");
-                #else
-                    ut.failure("getMemoryUsage increases size");
-                #endif
-            }
-            if ( n_bytes1==n_bytes3 ) {
-                ut.passes("getMemoryUsage decreases size properly");
-            } else {
-                #if defined(USE_MAC) || defined(USE_WINDOWS)
-                    ut.expected_failure("getMemoryUsage does not decrease size properly");
-                #else
-                    ut.failure("getMemoryUsage does not decrease size properly");
-                #endif
-            }
-        }
-
-        // Test getting the current call stack
-        std::vector<std::string> call_stack = get_call_stack();
-        if ( rank==0 ) {
-            std::cout << "Call stack:" << std::endl;
-            for (size_t i=0; i<call_stack.size(); i++)
-                std::cout << "   " << call_stack[i] << std::endl;
-        }
-        if ( !call_stack.empty() ) {
-            ut.passes("non empty call stack");
-            if ( call_stack[0].find("get_call_stack()") != std::string::npos )
-                ut.passes("call stack decoded function symbols");
-            else
-                ut.expected_failure("call stack was unable to decode function symbols");
-        } else {
-            ut.failure("non empty call stack");
-        }
-
-        // Test catching an error
-        try {
-            ERROR("Test");
-            ut.failure("Failed to catch RAY_ERROR");
-        } catch (...) {
-            ut.passes("Caught RAY_ERROR");
-        }
-        try {
-            throw std::logic_error("test");
-            ut.failure("Failed to catch exception");
-        } catch (...) {
-            ut.passes("Caught exception");
-        }
-        
-        // Test time/tick
-        double time = Utilities::time();
-        double res = Utilities::tick();
-        if ( time==0 || res==0 )
-            ut.failure("time/tick");
-        else
-            ut.passes("time/tick");
-
-    }
-
-    // Finished
-    ut.report();
-    size_t N_errors = ut.NumFailGlobal();
-    if ( N_errors==0 )
-        printf("All tests passed\n");
-    MPI_Finalize();
-    return (int) N_errors;
-}
-
-
diff --git a/threadpool/Readme.txt b/threadpool/Readme.txt
new file mode 100644
index 00000000..2d0a661c
--- /dev/null
+++ b/threadpool/Readme.txt
@@ -0,0 +1,2 @@
+This directory contains external code released with permission under the license of this project.
+
diff --git a/threadpool/atomic_helpers.cpp b/threadpool/atomic_helpers.cpp
index 574cd30e..327cb3f5 100644
--- a/threadpool/atomic_helpers.cpp
+++ b/threadpool/atomic_helpers.cpp
@@ -25,5 +25,43 @@ static int create_atomic_pthread_lock()
 int atomic_pthread_lock_initialized = create_atomic_pthread_lock();
 #endif
 
+
+// Atomic operations for floating types
+double atomic_add( double volatile *x, double y )
+{
+    static_assert( sizeof( double ) == sizeof( int64_atomic ), "Unexpected size" );
+    union U {
+        double d;
+        int64_atomic i;
+    };
+    U a, b;
+    bool swap = false;
+    auto x2   = reinterpret_cast<int64_atomic volatile *>( x );
+    while ( !swap ) {
+        a.i  = atomic_add( x2, 0 );
+        b.d  = a.d + y;
+        swap = atomic_compare_and_swap( x2, a.i, b.i );
+    }
+    return b.d;
+}
+float atomic_add( float volatile *x, float y )
+{
+    static_assert( sizeof( float ) == sizeof( int32_atomic ), "Unexpected size" );
+    union U {
+        float d;
+        int32_atomic i;
+    };
+    U a, b;
+    bool swap = false;
+    auto x2   = reinterpret_cast<int32_atomic volatile *>( x );
+    while ( !swap ) {
+        a.i  = atomic_add( x2, 0 );
+        b.d  = a.d + y;
+        swap = atomic_compare_and_swap( x2, a.i, b.i );
+    }
+    return b.d;
+}
+
+
 } // AtomicOperations namespace
 
diff --git a/threadpool/atomic_helpers.h b/threadpool/atomic_helpers.h
index e1eec545..32b67200 100644
--- a/threadpool/atomic_helpers.h
+++ b/threadpool/atomic_helpers.h
@@ -2,6 +2,8 @@
 // but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 #ifndef included_ThreadPoolAtomicHelpers
 #define included_ThreadPoolAtomicHelpers
+
+#include <stdexcept>
 #include <stdint.h>
 #include <stdio.h>
 #include <typeinfo>
@@ -10,7 +12,6 @@
 #if defined( WIN32 ) || defined( _WIN32 ) || defined( WIN64 ) || defined( _WIN64 )
 // Using windows
 #define USE_WINDOWS
-#define NOMINMAX
 #include <process.h>
 #include <stdlib.h>
 #include <windows.h>
@@ -529,6 +530,11 @@ inline void atomic_swap( int64_atomic volatile *x, int64_atomic *y )
 }
 
 
+// Atomic operations for floating types
+double atomic_add( double volatile *x, double y );
+float atomic_add( float volatile *x, float y );
+
+
 // Define an atomic counter
 struct counter_t {
 public:
diff --git a/threadpool/atomic_list.h b/threadpool/atomic_list.h
index 5da8cc85..c60c2869 100644
--- a/threadpool/atomic_list.h
+++ b/threadpool/atomic_list.h
@@ -14,12 +14,16 @@
  * \details This class implements a basic sorted list that is thread-safe and lock-free.
  *    Entries are stored smallest to largest according to the compare operator
  */
-template<class TYPE, int MAX_SIZE, class COMPARE = std::less<TYPE>>
+template<class TYPE, class COMPARE = std::less<TYPE>>
 class AtomicList final
 {
 public:
     //! Default constructor
-    AtomicList( const TYPE &default_value = TYPE(), const COMPARE &comp = COMPARE() );
+    AtomicList( size_t capacity = 1024, const TYPE &default_value = TYPE(),
+        const COMPARE &comp = COMPARE() );
+
+    //! Destructor
+    ~AtomicList();
 
     /*!
      * \brief   Remove an item from the list
@@ -33,8 +37,8 @@ public:
      *                      bool cmp( const TYPE& value, ... );
      * @param args      Additional arguments for the comparison
      */
-    template<class Compare, class... Args>
-    inline TYPE remove( Compare compare, Args... args );
+    template<typename Compare, class... Args>
+    inline TYPE remove( Compare compare, const Args &... args );
 
     //! Remove the first from the list
     inline TYPE remove_first();
@@ -44,13 +48,13 @@ public:
      * \details Insert an item into the list
      * @param x         Item to insert
      */
-    inline void insert( TYPE x );
+    inline void insert( const TYPE &x );
 
     /*!
      * \brief   Return the size of the list
      * \details Return the number of items in the list
      */
-    inline int size() const { return AtomicOperations::atomic_get( &d_N ); }
+    inline size_t size() const { return AtomicOperations::atomic_get( &d_N ); }
 
     /*!
      * \brief   Check if the list is empty
@@ -58,11 +62,23 @@ public:
      */
     inline bool empty() const { return AtomicOperations::atomic_get( &d_N ) == 0; }
 
+    /*!
+     * \brief   Clear the list
+     * \details Removes all entries from the list
+     */
+    inline void clear()
+    {
+        while ( !empty() ) {
+            remove_first();
+        }
+    }
+
     /*!
      * \brief   Return the capacity of the list
      * \details Return the maximum number of items the list can hold
      */
-    inline int capacity() const { return MAX_SIZE; }
+    inline constexpr size_t capacity() const { return d_capacity; }
+
 
     /*!
      * \brief   Check the list
@@ -76,19 +92,20 @@ public:
 
 
     //! Return the total number of inserts since object creation
-    inline int64_t N_insert() const { return AtomicOperations::atomic_get( &d_N_insert ); }
+    inline size_t N_insert() const { return AtomicOperations::atomic_get( &d_N_insert ); }
 
 
     //! Return the total number of removals since object creation
-    inline int64_t N_remove() const { return AtomicOperations::atomic_get( &d_N_remove ); }
+    inline size_t N_remove() const { return AtomicOperations::atomic_get( &d_N_remove ); }
 
 private:
     // Data members
     COMPARE d_compare;
+    const size_t d_capacity;
     volatile TYPE d_default;
-    volatile TYPE d_objects[MAX_SIZE];
+    volatile TYPE *d_objects;
     volatile AtomicOperations::int32_atomic d_N;
-    volatile AtomicOperations::int32_atomic d_next[MAX_SIZE + 1];
+    volatile AtomicOperations::int32_atomic *d_next;
     volatile AtomicOperations::int32_atomic d_unused;
     volatile AtomicOperations::int64_atomic d_N_insert;
     volatile AtomicOperations::int64_atomic d_N_remove;
@@ -99,8 +116,9 @@ private:
         if ( i == -1 )
             return -1;
         int tmp = 0;
-        while ( tmp == 0 )
+        do {
             tmp = AtomicOperations::atomic_fetch_and_and( &d_next[i], 0 );
+        } while ( tmp == 0 );
         return tmp;
     }
     inline void unlock( int i, int value )
@@ -111,8 +129,9 @@ private:
     inline int get_unused()
     {
         int i = 0;
-        while ( i == 0 )
+        do {
             i = AtomicOperations::atomic_fetch_and_and( &d_unused, 0 );
+        } while ( i == 0 );
         AtomicOperations::atomic_fetch_and_or( &d_unused, -( d_next[i] + 4 ) + 1 );
         d_next[i] = -3;
         return i;
@@ -120,16 +139,17 @@ private:
     inline void put_unused( int i )
     {
         int j = 0;
-        while ( j == 0 )
+        do {
             AtomicOperations::atomic_swap( &d_unused, &j );
+        } while ( j == 0 );
         d_next[i] = -3 - j;
         AtomicOperations::atomic_fetch_and_or( &d_unused, i );
     }
 
 
-private:
-    AtomicList( const AtomicList & );
-    AtomicList &operator=( const AtomicList & );
+public:
+    AtomicList( const AtomicList & ) = delete;
+    AtomicList &operator=( const AtomicList & ) = delete;
 };
 
 
diff --git a/threadpool/atomic_list.hpp b/threadpool/atomic_list.hpp
index a0850971..3a4df598 100644
--- a/threadpool/atomic_list.hpp
+++ b/threadpool/atomic_list.hpp
@@ -10,28 +10,39 @@
 /******************************************************************
  * Constructor                                                     *
  ******************************************************************/
-template<class TYPE, int MAX_SIZE, class COMPARE>
-AtomicList<TYPE, MAX_SIZE, COMPARE>::AtomicList( const TYPE &default_value, const COMPARE &comp )
-    : d_compare( comp ), d_default( default_value )
+template<class TYPE, class COMPARE>
+AtomicList<TYPE, COMPARE>::AtomicList(
+    size_t capacity, const TYPE &default_value, const COMPARE &comp )
+    : d_compare( comp ),
+      d_capacity( capacity ),
+      d_default( default_value ),
+      d_objects( new TYPE[capacity] ),
+      d_N( 0 ),
+      d_next( new AtomicOperations::int32_atomic[capacity + 1] ),
+      d_unused( 1 ),
+      d_N_insert( 0 ),
+      d_N_remove( 0 )
 {
-    d_N        = 0;
-    d_next[0]  = -1;
-    d_unused   = 1;
-    d_N_insert = 0;
-    d_N_remove = 0;
-    for ( int i = 0; i < MAX_SIZE; i++ ) {
+    d_next[0] = -1;
+    for ( size_t i = 0; i < d_capacity; i++ ) {
         d_next[i + 1] = -5 - i;
         d_objects[i]  = d_default;
     }
 }
+template<class TYPE, class COMPARE>
+AtomicList<TYPE, COMPARE>::~AtomicList()
+{
+    delete[] d_objects;
+    delete[] d_next;
+}
 
 
 /******************************************************************
  * Remove an item                                                  *
  ******************************************************************/
-template<class TYPE, int MAX_SIZE, class COMPARE>
+template<class TYPE, class COMPARE>
 template<class Compare, class... Args>
-inline TYPE AtomicList<TYPE, MAX_SIZE, COMPARE>::remove( Compare compare, Args... args )
+inline TYPE AtomicList<TYPE, COMPARE>::remove( Compare compare, const Args &... args )
 {
     // Acquiring temporary ownership
     int pos   = 0;
@@ -50,8 +61,7 @@ inline TYPE AtomicList<TYPE, MAX_SIZE, COMPARE>::remove( Compare compare, Args..
         // Test to see if the object passes compare
         bool test = compare( const_cast<TYPE &>( d_objects[next - 1] ), args... );
         if ( test ) {
-            // We want to return this object, update next to point to another entry and remove the
-            // entry
+            // We want to return this object, update next to point to another entry and remove it
             unlock( next, -3 );
             unlock( pos, next2 );
             pos = next;
@@ -71,8 +81,8 @@ inline TYPE AtomicList<TYPE, MAX_SIZE, COMPARE>::remove( Compare compare, Args..
     }
     return rtn;
 }
-template<class TYPE, int MAX_SIZE, class COMPARE>
-inline TYPE AtomicList<TYPE, MAX_SIZE, COMPARE>::remove_first()
+template<class TYPE, class COMPARE>
+inline TYPE AtomicList<TYPE, COMPARE>::remove_first()
 {
     TYPE rtn( d_default );
     auto next = lock( 0 );
@@ -94,11 +104,11 @@ inline TYPE AtomicList<TYPE, MAX_SIZE, COMPARE>::remove_first()
 /******************************************************************
  * Insert an item                                                  *
  ******************************************************************/
-template<class TYPE, int MAX_SIZE, class COMPARE>
-inline void AtomicList<TYPE, MAX_SIZE, COMPARE>::insert( TYPE x )
+template<class TYPE, class COMPARE>
+inline void AtomicList<TYPE, COMPARE>::insert( const TYPE &x )
 {
-    int N_used = AtomicOperations::atomic_increment( &d_N );
-    if ( N_used > MAX_SIZE ) {
+    size_t N_used = AtomicOperations::atomic_increment( &d_N );
+    if ( N_used > d_capacity ) {
         AtomicOperations::atomic_decrement( &d_N );
         throw std::logic_error( "No room in list" );
     }
@@ -141,8 +151,8 @@ inline void AtomicList<TYPE, MAX_SIZE, COMPARE>::insert( TYPE x )
  * Check the internal structures of the list                       *
  * This is mostly thread-safe, but blocks all threads              *
  ******************************************************************/
-template<class TYPE, int MAX_SIZE, class COMPARE>
-inline bool AtomicList<TYPE, MAX_SIZE, COMPARE>::check()
+template<class TYPE, class COMPARE>
+inline bool AtomicList<TYPE, COMPARE>::check()
 {
     // Get the lock and check for any other threads modifying the list
     auto start = lock( 0 );
@@ -153,11 +163,11 @@ inline bool AtomicList<TYPE, MAX_SIZE, COMPARE>::check()
     int N2       = 0;
     int N_unused = 0;
     int N_tail   = 0;
-    for ( int i = 0; i < MAX_SIZE; i++ ) {
+    for ( size_t i = 0; i < d_capacity; i++ ) {
         if ( d_objects[i] != d_default )
             N1++;
     }
-    for ( int i = 0; i < MAX_SIZE + 1; i++ ) {
+    for ( size_t i = 0; i <= d_capacity; i++ ) {
         int next = i == 0 ? start : d_next[i];
         if ( next > 0 ) {
             N2++;
@@ -169,7 +179,7 @@ inline bool AtomicList<TYPE, MAX_SIZE, COMPARE>::check()
             pass = false;
         }
     }
-    pass    = pass && N_tail == 1 && N1 == d_N && N2 == d_N && N_unused + d_N == MAX_SIZE;
+    pass    = pass && N_tail == 1 && N1 == d_N && N2 == d_N && N_unused + d_N == (int) d_capacity;
     int it  = 0;
     int pos = 0;
     while ( true ) {
diff --git a/threadpool/test/CMakeLists.txt b/threadpool/test/CMakeLists.txt
deleted file mode 100644
index 90490864..00000000
--- a/threadpool/test/CMakeLists.txt
+++ /dev/null
@@ -1,16 +0,0 @@
-# Add thread pool tests
-ADD_LBPM_TEST( test_atomic )
-ADD_LBPM_TEST( test_atomic_list )
-SET_TESTS_PROPERTIES ( test_atomic PROPERTIES FAIL_REGULAR_EXPRESSION ".*FAILED.*" PROCESSORS 64 )
-ADD_LBPM_TEST_THREAD_MPI( test_thread_pool 1 4 )
-ADD_LBPM_TEST_THREAD_MPI( test_thread_pool 2 4 )
-ADD_LBPM_TEST_THREAD_MPI( test_thread_pool 4 4 )
-SET_PROPERTY( TEST test_thread_pool_1procs_4threads APPEND PROPERTY RUN_SERIAL 1 )
-IF ( USE_MPI )
-    SET_PROPERTY( TEST test_thread_pool_2procs_4threads APPEND PROPERTY RUN_SERIAL 1 )
-    SET_PROPERTY( TEST test_thread_pool_4procs_4threads APPEND PROPERTY RUN_SERIAL 1 )
-ENDIF()
-
-
-
-
diff --git a/threadpool/test/test_atomic.cpp b/threadpool/test/test_atomic.cpp
deleted file mode 100644
index 27c76ee1..00000000
--- a/threadpool/test/test_atomic.cpp
+++ /dev/null
@@ -1,154 +0,0 @@
-#include "threadpool/atomic_helpers.h"
-#include "common/UnitTest.h"
-#include "common/Utilities.h"
-#include <atomic>
-#include <chrono>
-#include <cstdio>
-#include <cstdlib>
-#include <functional>
-#include <iostream>
-#include <string>
-#include <thread>
-#include <vector>
-
-
-#define perr std::cerr
-#define pout std::cout
-#define printp printf
-
-
-// Function to increment/decrement a counter N times
-static void modify_counter( int N, AtomicOperations::counter_t &counter )
-{
-    if ( N > 0 ) {
-        for ( int i = 0; i < N; i++ )
-            counter.increment();
-    } else if ( N < 0 ) {
-        for ( int i = 0; i < -N; i++ )
-            counter.decrement();
-    }
-}
-
-
-/******************************************************************
- * The main program                                                *
- ******************************************************************/
-#ifdef USE_WINDOWS
-int __cdecl main( int, char ** )
-{
-#elif defined( USE_LINUX ) || defined( USE_MAC )
-int main( int, char *[] )
-{
-#else
-#error Unknown OS
-#endif
-    UnitTest ut;
-
-    int N_threads = 64;      // Number of threads
-    int N_count   = 1000000; // Number of work items
-
-// Ensure we are using all processors
-#ifdef __USE_GNU
-    int N_procs = sysconf( _SC_NPROCESSORS_ONLN );
-    cpu_set_t mask;
-    CPU_ZERO( &mask );
-    for ( int i = 0; i < N_procs; i++ )
-        CPU_SET( i, &mask );
-    sched_setaffinity( getpid(), sizeof( cpu_set_t ), &mask );
-#endif
-
-    // Create the counter we want to test
-    AtomicOperations::counter_t count;
-    if ( count.increment() == 1 )
-        ut.passes( "increment count" );
-    else
-        ut.failure( "increment count" );
-    if ( count.decrement() == 0 )
-        ut.passes( "decrement count" );
-    else
-        ut.failure( "decrement count" );
-    count.setCount( 3 );
-    if ( count.getCount() == 3 )
-        ut.passes( "set count" );
-    else
-        ut.failure( "set count" );
-    count.setCount( 0 );
-
-    // Increment the counter in serial
-    auto start = std::chrono::high_resolution_clock::now();
-    modify_counter( N_count, count );
-    auto stop              = std::chrono::high_resolution_clock::now();
-    double time_inc_serial = std::chrono::duration<double>( stop - start ).count() / N_count;
-    int val                = count.getCount();
-    if ( val != N_count ) {
-        char tmp[100];
-        sprintf( tmp, "Count of %i did not match expected count of %i", val, N_count );
-        ut.failure( tmp );
-    }
-    printp( "Time to increment (serial) = %0.1f ns\n", 1e9 * time_inc_serial );
-
-    // Decrement the counter in serial
-    start = std::chrono::high_resolution_clock::now();
-    modify_counter( -N_count, count );
-    stop                   = std::chrono::high_resolution_clock::now();
-    double time_dec_serial = std::chrono::duration<double>( stop - start ).count() / N_count;
-    val                    = count.getCount();
-    if ( val != 0 ) {
-        char tmp[100];
-        sprintf( tmp, "Count of %i did not match expected count of %i", val, 0 );
-        ut.failure( tmp );
-    }
-    printp( "Time to decrement (serial) = %0.1f ns\n", 1e9 * time_dec_serial );
-
-    // Increment the counter in parallel
-    std::vector<std::thread> threads( N_threads );
-    start = std::chrono::high_resolution_clock::now();
-    for ( int i = 0; i < N_threads; i++ )
-        threads[i] = std::thread( modify_counter, N_count, std::ref( count ) );
-    for ( int i = 0; i < N_threads; i++ )
-        threads[i].join();
-    stop = std::chrono::high_resolution_clock::now();
-    double time_inc_parallel =
-        std::chrono::duration<double>( stop - start ).count() / ( N_count * N_threads );
-    val = count.getCount();
-    if ( val != N_count * N_threads ) {
-        char tmp[100];
-        sprintf( tmp, "Count of %i did not match expected count of %i", val, N_count * N_threads );
-        ut.failure( tmp );
-    }
-    printp( "Time to increment (parallel) = %0.1f ns\n", 1e9 * time_inc_parallel );
-
-    // Decrement the counter in parallel
-    start = std::chrono::high_resolution_clock::now();
-    for ( int i = 0; i < N_threads; i++ )
-        threads[i] = std::thread( modify_counter, -N_count, std::ref( count ) );
-    for ( int i = 0; i < N_threads; i++ )
-        threads[i].join();
-    stop = std::chrono::high_resolution_clock::now();
-    double time_dec_parallel =
-        std::chrono::duration<double>( stop - start ).count() / ( N_count * N_threads );
-    val = count.getCount();
-    if ( val != 0 ) {
-        char tmp[100];
-        sprintf( tmp, "Count of %i did not match expected count of %i", val, 0 );
-        ut.failure( tmp );
-    }
-    printp( "Time to decrement (parallel) = %0.1f ns\n", 1e9 * time_dec_parallel );
-
-    // Check the time to increment/decrement
-    if ( time_inc_serial > 100e-9 || time_dec_serial > 100e-9 || time_inc_parallel > 100e-9 ||
-         time_dec_serial > 100e-9 ) {
-#if USE_GCOV
-        ut.expected_failure( "Time to increment/decrement count is too expensive" );
-#else
-        ut.failure( "Time to increment/decrement count is too expensive" );
-#endif
-    } else {
-        ut.passes( "Time to increment/decrement passed" );
-    }
-
-    // Finished
-    ut.report();
-    auto N_errors = static_cast<int>( ut.NumFailGlobal() );
-    return N_errors;
-}
diff --git a/threadpool/test/test_atomic_list.cpp b/threadpool/test/test_atomic_list.cpp
deleted file mode 100644
index 4717dcc3..00000000
--- a/threadpool/test/test_atomic_list.cpp
+++ /dev/null
@@ -1,221 +0,0 @@
-#include "threadpool/atomic_list.h"
-#include "common/UnitTest.h"
-#include "common/Utilities.h"
-#include <algorithm>
-#include <atomic>
-#include <chrono>
-#include <cstdio>
-#include <cstdlib>
-#include <functional>
-#include <iostream>
-#include <string>
-#include <thread>
-#include <vector>
-
-
-
-static void modify_list( AtomicList<int, 1024> &list )
-{
-    const int N_count = 50000;
-    for ( int i = 0; i < N_count; i++ ) {
-        auto v1 = list.remove_first();
-        auto v2 = list.remove( []( int ) { return true; } );
-        auto v3 = list.remove( []( int v ) { return v >= ( rand() / 8 ); } );
-        auto v4 = list.remove( []( int v ) { return v >= ( rand() / 4 ); } );
-        auto v5 = list.remove( []( int v ) { return v >= ( rand() / 2 ); } );
-        if ( v1 != -1 ) {
-            list.insert( v1 );
-        }
-        if ( v2 != -1 ) {
-            list.insert( v2 );
-        }
-        if ( v3 != -1 ) {
-            list.insert( v3 );
-        }
-        if ( v4 != -1 ) {
-            list.insert( v4 );
-        }
-        if ( v5 != -1 ) {
-            list.insert( v5 );
-        }
-    }
-}
-
-
-static bool check_list( const std::vector<int> &x, AtomicList<int, 1024> &list )
-{
-    bool pass = list.check();
-    pass      = pass && (int) x.size() == list.size();
-    if ( pass ) {
-        for ( int i : x )
-            pass = pass && i == list.remove( []( int ) { return true; } );
-    }
-    // Restore the list
-    for ( int i = 0; i < list.size(); i++ )
-        list.remove_first();
-    for ( int i : x )
-        list.insert( i );
-    return pass;
-}
-
-
-static inline void clear_list( AtomicList<int, 1024> &list )
-{
-    for ( int i = 0; i < list.size(); i++ )
-        list.remove_first();
-}
-
-
-/******************************************************************
- * The main program                                                *
- ******************************************************************/
-int main( int, char *[] )
-{
-    UnitTest ut;
-
-    int N_threads = 8; // Number of threads
-
-    // Create the list
-    AtomicList<int, 1024> list( -1 );
-    if ( list.size() == 0 && list.check() )
-        ut.passes( "Initialize" );
-    else
-        ut.failure( "Initialize" );
-
-    // Initialize the list with some empty values
-    for ( int i = 0; i < 80; i++ )
-        list.insert( rand() );
-    list.insert( 2 );
-    list.insert( 1 );
-    list.insert( rand() );
-
-    // Try to pull off a couple of values
-    int v1 = list.remove( []( int a ) { return a == 1; } ); // Find the entry with 1
-    int v2 = list.remove( []( int ) { return true; } );     // Get the first entry
-    int v3 = list.remove( []( int ) { return false; } );    // Fail to get an entry
-    if ( v1 == 1 && v2 == 2 && v3 == -1 && list.size() == 81 && list.check() )
-        ut.passes( "Basic sanity test" );
-    else
-        ut.failure( "Basic sanity test" );
-
-    // Clear the list
-    while ( list.remove( []( int ) { return true; } ) != -1 ) {
-    }
-
-    // Create a list of known values
-    // std::vector<int> data0(512);
-    std::vector<int> data0( 5 * N_threads );
-    for ( int &i : data0 )
-        i = rand();
-    auto data = data0;
-    std::sort( data.begin(), data.end() );
-
-    // Test the cost to insert
-    int N_it = 20;
-    for ( int i = 0; i < list.size(); i++ )
-        list.remove( []( int ) { return true; } );
-    std::chrono::duration<double> time;
-    std::chrono::time_point<std::chrono::high_resolution_clock> start, stop;
-    time = time.zero();
-    for ( int it = 0; it < N_it; it++ ) {
-        clear_list( list );
-        start = std::chrono::high_resolution_clock::now();
-        for ( int i : data0 )
-            list.insert( i );
-        stop = std::chrono::high_resolution_clock::now();
-        time += ( stop - start );
-    }
-    printf( "insert time/item = %0.0f ns\n", 1e9 * time.count() / ( N_it * data0.size() ) );
-
-    // Test the cost to remove (first)
-    time = time.zero();
-    for ( int it = 0; it < N_it; it++ ) {
-        check_list( data, list );
-        start = std::chrono::high_resolution_clock::now();
-        for ( size_t i = 0; i < data0.size(); i++ )
-            list.remove_first();
-        stop = std::chrono::high_resolution_clock::now();
-        time += ( stop - start );
-    }
-    printf( "remove (first) time/item = %0.0f ns\n", 1e9 * time.count() / ( N_it * data0.size() ) );
-
-    // Test the cost to remove (in order)
-    time = time.zero();
-    for ( int it = 0; it < N_it; it++ ) {
-        check_list( data, list );
-        start = std::chrono::high_resolution_clock::now();
-        for ( size_t i = 0; i < data0.size(); i++ )
-            list.remove( []( int ) { return true; } );
-        stop = std::chrono::high_resolution_clock::now();
-        time += ( stop - start );
-    }
-    printf(
-        "remove (ordered) time/item = %0.0f ns\n", 1e9 * time.count() / ( N_it * data0.size() ) );
-
-    // Test the cost to remove (out order)
-    time = time.zero();
-    for ( int it = 0; it < N_it; it++ ) {
-        check_list( data, list );
-        start = std::chrono::high_resolution_clock::now();
-        for ( int tmp : data0 ) {
-            list.remove( [tmp]( int v ) { return v == tmp; } );
-        }
-        stop = std::chrono::high_resolution_clock::now();
-        time += ( stop - start );
-    }
-    printf(
-        "remove (unordered) time/item = %0.0f ns\n", 1e9 * time.count() / ( N_it * data0.size() ) );
-
-    // Read/write to the list and check the results
-    int64_t N0 = list.N_remove();
-    check_list( data, list );
-    start = std::chrono::high_resolution_clock::now();
-    modify_list( list );
-    stop               = std::chrono::high_resolution_clock::now();
-    double time_serial = std::chrono::duration<double>( stop - start ).count();
-    int64_t N1         = list.N_remove();
-    bool pass          = check_list( data, list );
-    if ( pass )
-        ut.passes( "Serial get/insert" );
-    else
-        ut.failure( "Serial get/insert" );
-    printf( "serial time = %0.5f s\n", time_serial );
-    printf( "serial time/item = %0.0f ns\n", 1e9 * time_serial / ( N1 - N0 ) );
-
-    // Have multiple threads reading/writing to the list simultaneously
-    std::vector<std::thread> threads( N_threads );
-    start = std::chrono::high_resolution_clock::now();
-    for ( int i = 0; i < N_threads; i++ )
-        threads[i] = std::thread( modify_list, std::ref( list ) );
-    for ( int i = 0; i < N_threads; i++ )
-        threads[i].join();
-    stop                 = std::chrono::high_resolution_clock::now();
-    double time_parallel = std::chrono::duration<double>( stop - start ).count();
-    int64_t N2           = list.N_remove();
-    pass                 = check_list( data, list );
-    if ( pass )
-        ut.passes( "Parallel get/insert" );
-    else
-        ut.failure( "Parallel get/insert" );
-    printf( "parallel time = %0.5f s\n", time_parallel );
-    printf( "parallel time/item = %0.0f ns\n", 1e9 * time_parallel / ( N2 - N1 ) );
-
-    // Try to over-fill the list
-    while ( !list.empty() )
-        list.remove_first();
-    for ( int i = 1; i <= list.capacity(); i++ )
-        list.insert( i );
-    try {
-        list.insert( list.capacity() + 1 );
-        ut.failure( "List overflow" );
-    } catch ( const std::exception &e ) {
-        ut.passes( "List overflow" );
-    } catch ( ... ) {
-        ut.failure( "List overflow (unknown exception)" );
-    }
-
-    // Finished
-    ut.report();
-    auto N_errors = static_cast<int>( ut.NumFailGlobal() );
-    return N_errors;
-}
diff --git a/threadpool/test/test_thread_pool.cpp b/threadpool/test/test_thread_pool.cpp
deleted file mode 100644
index b7168f4b..00000000
--- a/threadpool/test/test_thread_pool.cpp
+++ /dev/null
@@ -1,967 +0,0 @@
-#include "ProfilerApp.h"
-#ifdef USE_TIMER
-#include "MemoryApp.h"
-#endif
-#include "threadpool/thread_pool.h"
-#include "common/UnitTest.h"
-#include "common/Utilities.h"
-#include <algorithm>
-#include <cmath>
-#include <cstdio>
-#include <cstdlib>
-#include <iostream>
-#include <mutex>
-#include <stdexcept>
-#include <string>
-#include <vector>
-
-
-#define MAX( x, y ) ( ( x ) > ( y ) ? ( x ) : ( y ) )
-
-
-#define perr std::cerr
-#define pout std::cout
-#define printp printf
-
-
-#ifdef USE_MPI
-#include "mpi.h"
-#endif
-
-#define to_ns( x ) std::chrono::duration_cast<std::chrono::nanoseconds>( x ).count()
-#define to_ms( x ) std::chrono::duration_cast<std::chrono::milliseconds>( x ).count()
-
-
-// Wrapper functions for mpi
-static inline void barrier()
-{
-#ifdef USE_MPI
-    MPI_Barrier( MPI_COMM_WORLD );
-#endif
-}
-static inline int getRank()
-{
-    int rank = 0;
-#ifdef USE_MPI
-    MPI_Comm_rank( MPI_COMM_WORLD, &rank );
-#endif
-    return rank;
-}
-static inline int getSize()
-{
-    int size = 0;
-#ifdef USE_MPI
-    MPI_Comm_size( MPI_COMM_WORLD, &size );
-#endif
-    return size;
-}
-
-
-// Function to waste CPU cycles
-void waste_cpu( int N )
-{
-    if ( N > 10000 ) {
-        PROFILE_START( "waste_cpu", 2 );
-    }
-    double pi = 3.141592653589793;
-    double x  = 1.0;
-    N         = std::max( 10, N );
-    {
-        for ( int i = 0; i < N; i++ )
-            x = sqrt( x * exp( pi / x ) );
-    } // style to limit gcov hits
-    if ( fabs( x - 2.926064057273157 ) > 1e-12 ) {
-        abort();
-    }
-    if ( N > 10000 ) {
-        PROFILE_STOP( "waste_cpu", 2 );
-    }
-}
-
-
-// Sleep for the given time
-// Note: since we may encounter interrupts, we may not sleep for the desired time
-//   so we need to perform the sleep in a loop
-void sleep_ms( int64_t N )
-{
-    auto t1 = std::chrono::high_resolution_clock::now();
-    auto t2 = std::chrono::high_resolution_clock::now();
-    while ( to_ms( t2 - t1 ) < N ) {
-        int N2 = N - to_ms( t2 - t1 );
-        std::this_thread::sleep_for( std::chrono::milliseconds( N2 ) );
-        t2 = std::chrono::high_resolution_clock::now();
-    }
-}
-void sleep_s( int N ) { sleep_ms( 1000 * N ); }
-
-
-// Function to sleep for N seconds then increment a global count
-static volatile int global_sleep_count = 0;
-void sleep_inc( int N )
-{
-    PROFILE_START( "sleep_inc" );
-    sleep_s( N );
-    ++global_sleep_count;
-    PROFILE_STOP( "sleep_inc" );
-}
-void sleep_inc2( double x )
-{
-    sleep_ms( static_cast<int>( round( x * 1000 ) ) );
-    ++global_sleep_count;
-}
-void sleep_msg( double x, std::string msg )
-{
-    PROFILE_START( msg );
-    sleep_ms( static_cast<int>( round( x * 1000 ) ) );
-    NULL_USE( msg );
-    PROFILE_STOP( msg );
-}
-bool check_inc( int N ) { return global_sleep_count == N; }
-
-
-// Function to return the processor for the given thread
-std::mutex print_processor_mutex;
-
-void print_processor( ThreadPool *tpool )
-{
-    int rank = 0;
-#ifdef USE_MPI
-    MPI_Comm_rank( MPI_COMM_WORLD, &rank );
-#endif
-    int thread    = tpool->getThreadNumber();
-    int processor = ThreadPool::getCurrentProcessor();
-    char tmp[100];
-    sprintf( tmp, "%i:  Thread,proc = %i,%i\n", rank, thread, processor );
-    sleep_ms( 10 * rank );
-    print_processor_mutex.lock();
-    pout << tmp;
-    print_processor_mutex.unlock();
-    sleep_ms( 100 );
-}
-
-
-// Function to test how a member thread interacts with the thread pool
-int test_member_thread( ThreadPool *tpool )
-{
-    int N_errors = 0;
-    // Member threads are not allowed to wait for the pool to finish
-    try {
-        tpool->wait_pool_finished();
-        N_errors++;
-    } catch ( ... ) {
-    }
-    // Member threads are not allowed to change the size of the pool
-    try {
-        tpool->wait_pool_finished();
-        N_errors++;
-    } catch ( ... ) {
-    }
-    return N_errors;
-}
-
-
-/******************************************************************
- * Test the TPOOL_ADD_WORK macro with variable number of arguments *
- ******************************************************************/
-static int myfun0() { return 0; }
-static int myfun1( int ) { return 1; }
-static int myfun2( int, float ) { return 2; }
-static int myfun3( int, float, double ) { return 3; }
-static int myfun4( int, float, double, char ) { return 4; }
-static int myfun5( int, float, double, char, std::string ) { return 5; }
-static int myfun6( int, float, double, char, std::string, int ) { return 6; }
-static int myfun7( int, float, double, char, std::string, int, int ) { return 7; }
-static int test_function_arguements( ThreadPool *tpool )
-{
-    int N_errors = 0;
-    // Test some basic types of instantiations
-    ThreadPool::thread_id_t id0 = TPOOL_ADD_WORK( tpool, myfun0, ( nullptr ) );
-    ThreadPool::thread_id_t id1 = TPOOL_ADD_WORK( tpool, myfun1, ( (int) 1 ) );
-    ThreadPool::thread_id_t id2 = TPOOL_ADD_WORK( tpool, myfun2, ( (int) 1, (float) 2 ) );
-    ThreadPool::thread_id_t id3 =
-        TPOOL_ADD_WORK( tpool, myfun3, ( (int) 1, (float) 2, (double) 3 ) );
-    ThreadPool::thread_id_t id4 =
-        TPOOL_ADD_WORK( tpool, myfun4, ( (int) 1, (float) 2, (double) 3, (char) 4 ) );
-    ThreadPool::thread_id_t id5 = TPOOL_ADD_WORK(
-        tpool, myfun5, ( (int) 1, (float) 2, (double) 3, (char) 4, std::string( "test" ) ) );
-    ThreadPool::thread_id_t id52 = TPOOL_ADD_WORK(
-        tpool, myfun5, ( (int) 1, (float) 2, (double) 3, (char) 4, std::string( "test" ) ), -1 );
-    ThreadPool::thread_id_t id6 = TPOOL_ADD_WORK( tpool, myfun6,
-        ( (int) 1, (float) 2, (double) 3, (char) 4, std::string( "test" ), (int) 1 ) );
-    ThreadPool::thread_id_t id7 = TPOOL_ADD_WORK( tpool, myfun7,
-        ( (int) 1, (float) 2, (double) 3, (char) 4, std::string( "test" ), (int) 1, (int) 1 ) );
-    tpool->wait_pool_finished();
-    if ( !tpool->isFinished( id0 ) ) {
-        N_errors++;
-    }
-    if ( tpool->getFunctionRet<int>( id0 ) != 0 ) {
-        N_errors++;
-    }
-    if ( tpool->getFunctionRet<int>( id1 ) != 1 ) {
-        N_errors++;
-    }
-    if ( tpool->getFunctionRet<int>( id2 ) != 2 ) {
-        N_errors++;
-    }
-    if ( tpool->getFunctionRet<int>( id3 ) != 3 ) {
-        N_errors++;
-    }
-    if ( tpool->getFunctionRet<int>( id4 ) != 4 ) {
-        N_errors++;
-    }
-    if ( tpool->getFunctionRet<int>( id5 ) != 5 ) {
-        N_errors++;
-    }
-    if ( tpool->getFunctionRet<int>( id52 ) != 5 ) {
-        N_errors++;
-    }
-    if ( tpool->getFunctionRet<int>( id6 ) != 6 ) {
-        N_errors++;
-    }
-    if ( tpool->getFunctionRet<int>( id7 ) != 7 ) {
-        N_errors++;
-    }
-    return N_errors;
-}
-
-
-/******************************************************************
- * Examples to derive a user work item                             *
- ******************************************************************/
-class UserWorkItemVoid : public ThreadPool::WorkItem
-{
-public:
-    // User defined constructor (does not need to match any interfaces)
-    explicit UserWorkItemVoid( int dummy )
-    {
-        // User initialized variables
-        NULL_USE( dummy );
-    }
-    // User defined run (can do anything)
-    void run() override
-    {
-        // Perform the tasks
-        printf( "Hello work from UserWorkItem (void)" );
-    }
-    // Will the routine return a result
-    bool has_result() const override { return false; }
-    // User defined destructor
-    ~UserWorkItemVoid() override = default;
-};
-class UserWorkItemInt : public ThreadPool::WorkItemRet<int>
-{
-public:
-    // User defined constructor (does not need to match any interfaces)
-    explicit UserWorkItemInt( int dummy )
-    {
-        // User initialized variables
-        NULL_USE( dummy );
-    }
-    // User defined run (can do anything)
-    void run() override
-    {
-        // Perform the tasks
-        printf( "Hello work from UserWorkItem (int)" );
-        // Store the results (it's type will match the template)
-        ThreadPool::WorkItemRet<int>::d_result = 1;
-    }
-    // User defined destructor
-    ~UserWorkItemInt() override = default;
-};
-
-
-/******************************************************************
- * test the time to run N tasks in parallel                        *
- ******************************************************************/
-template<class Ret, class... Args>
-inline double launchAndTime( ThreadPool &tpool, int N, Ret ( *routine )( Args... ), Args... args )
-{
-    tpool.wait_pool_finished();
-    auto start = std::chrono::high_resolution_clock::now();
-    for ( int i = 0; i < N; i++ )
-        ThreadPool_add_work( &tpool, 0, routine, args... );
-    tpool.wait_pool_finished();
-    auto stop = std::chrono::high_resolution_clock::now();
-    return std::chrono::duration<double>( stop - start ).count();
-}
-
-
-// Move constructor function
-volatile ThreadPool::thread_id_t f1( volatile ThreadPool::thread_id_t a ) { return a; }
-ThreadPool::thread_id_t f2( ThreadPool::thread_id_t a ) { return a; }
-
-
-/******************************************************************
- * Test the basic functionallity of the atomics                    *
- ******************************************************************/
-int test_atomics()
-{
-    using namespace AtomicOperations;
-    int N_errors = 0;
-    volatile int32_atomic i32;
-    volatile int64_atomic i64;
-    i32 = 32;
-    i64 = 64;
-    if ( atomic_increment( &i32 ) != 33 || atomic_increment( &i64 ) != 65 )
-        N_errors++;
-    if ( atomic_decrement( &i32 ) != 32 || atomic_decrement( &i64 ) != 64 )
-        N_errors++;
-    if ( atomic_add( &i32, 2 ) != 34 || atomic_add( &i64, 4 ) != 68 )
-        N_errors++;
-    if ( atomic_compare_and_swap( &i32, 0, 0 ) || atomic_compare_and_swap( &i64, 0, 0 ) )
-        N_errors++;
-    if ( !atomic_compare_and_swap( &i32, 34, 32 ) || !atomic_compare_and_swap( &i64, 68, 64 ) )
-        N_errors++;
-    if ( i32 != 32 || i64 != 64 )
-        N_errors++;
-    return N_errors;
-}
-
-
-/******************************************************************
- * Test FIFO behavior                                              *
- ******************************************************************/
-void test_FIFO( UnitTest &ut, ThreadPool &tpool )
-{
-    int rank    = getRank();
-    int size    = getSize();
-    const int N = 4000;
-    for ( int r = 0; r < size; r++ ) {
-        barrier();
-        if ( r != rank )
-            continue;
-        std::vector<ThreadPool::thread_id_t> ids;
-        ids.reserve( N );
-        for ( size_t i = 0; i < N; i++ )
-            ids.emplace_back( TPOOL_ADD_WORK( &tpool, sleep_inc2, ( 0.001 ) ) );
-        bool pass = true;
-        while ( tpool.N_queued() > 0 ) {
-            int i1 = -1, i2 = ids.size();
-            for ( int i = N - 1; i >= 0; i-- ) {
-                bool started = ids[i].started();
-                if ( started )
-                    i1 = std::max<int>( i1, i ); // Last index to processing item
-                else
-                    i2 = std::min<int>( i2, i ); // First index to queued item
-            }
-            int diff = i1 == -1 ? 0 : ( i2 - i1 - 1 );
-            if ( abs( diff ) > 4 ) {
-                printf( "%i %i %i\n", i1, i2, diff );
-                pass = pass && abs( i2 - i1 - 1 ) <= 2;
-            }
-        }
-        ids.clear();
-        tpool.wait_pool_finished();
-        if ( pass )
-            ut.passes( "Thread pool behaves as FIFO" );
-        else
-            ut.failure( "Thread pool does not behave as FIFO" );
-    }
-}
-
-
-/******************************************************************
- * The main program                                                *
- ******************************************************************/
-#ifdef USE_WINDOWS
-int __cdecl main( int argc, char **argv )
-{
-#elif defined( USE_LINUX ) || defined( USE_MAC )
-int main( int argc, char *argv[] )
-{
-#else
-#error Unknown OS
-#endif
-
-    int N_threads = 4;    // Number of threads
-    int N_work    = 2000; // Number of work items
-    int N_it      = 10;   // Number of cycles to run
-    int N_problem = 5;    // Problem size
-    PROFILE_ENABLE( 3 );
-    PROFILE_ENABLE_TRACE();
-    PROFILE_DISABLE_MEMORY();
-    UnitTest ut;
-
-
-    // Initialize MPI and set the error handlers
-#ifdef USE_MPI
-    int provided_thread_support = -1;
-    MPI_Init_thread( &argc, &argv, MPI_THREAD_MULTIPLE, &provided_thread_support );
-    Utilities::setErrorHandlers();
-    // Disable OS specific warnings for all non-root ranks
-#endif
-    int rank = getRank();
-    int size = getSize();
-    if ( rank > 0 )
-        ThreadPool::set_OS_warnings( 1 );
-    NULL_USE( size );
-    NULL_USE( argc );
-    NULL_USE( argv );
-
-
-    // Test the atomics
-    if ( test_atomics() == 0 )
-        ut.passes( "Atomics passed" );
-    else
-        ut.failure( "Atomics failed" );
-
-    // Initialize the data
-    std::vector<int> data1( N_work, 0 );
-    std::vector<int> priority( N_work, 0 );
-    for ( int i = 0; i < N_work; i++ ) {
-        data1[i]    = N_problem;
-        priority[i] = i % 128;
-    }
-
-
-    // Print the size of the thread pool class
-    printp( "Size of ThreadPool = %i\n", (int) sizeof( ThreadPool ) );
-
-
-    // Get the number of processors availible
-    barrier();
-    int N_procs = ThreadPool::getNumberOfProcessors();
-    if ( N_procs > 0 )
-        ut.passes( "getNumberOfProcessors" );
-    else
-        ut.failure( "getNumberOfProcessors" );
-    printp( "%i processors availible\n", N_procs );
-
-
-    // Get the processor affinities for the process
-    barrier();
-    std::vector<int> cpus = ThreadPool::getProcessAffinity();
-    printp( "%i cpus for current process: ", (int) cpus.size() );
-    for ( int cpu : cpus )
-        printp( "%i ", cpu );
-    printp( "\n" );
-    if ( !cpus.empty() ) {
-        ut.passes( "getProcessAffinity" );
-    } else {
-#ifdef __APPLE__
-        ut.expected_failure( "getProcessAffinity" );
-#else
-        ut.failure( "getProcessAffinity" );
-#endif
-    }
-
-
-    // Test setting the process affinities
-    barrier();
-    bool pass = false;
-    if ( !cpus.empty() && N_procs > 0 ) {
-        if ( cpus.size() == 1 ) {
-            cpus.resize( N_procs );
-            for ( int i = 0; i < N_procs; i++ )
-                cpus.push_back( i );
-            try {
-                ThreadPool::setProcessAffinity( cpus );
-            } catch ( ... ) {
-            }
-            cpus                  = ThreadPool::getProcessAffinity();
-            std::vector<int> cpus = ThreadPool::getProcessAffinity();
-            printp( "%i cpus for current process (updated): ", (int) cpus.size() );
-            for ( int cpu : cpus )
-                printp( "%i ", cpu );
-            printp( "\n" );
-            pass = cpus.size() > 1;
-        } else {
-            std::vector<int> cpus_orig = cpus;
-            std::vector<int> cpus_tmp( 1, cpus[0] );
-            try {
-                ThreadPool::setProcessAffinity( cpus_tmp );
-            } catch ( ... ) {
-            }
-            cpus = ThreadPool::getProcessAffinity();
-            if ( cpus.size() == 1 )
-                pass = true;
-            try {
-                ThreadPool::setProcessAffinity( cpus_orig );
-            } catch ( ... ) {
-            }
-            cpus = ThreadPool::getProcessAffinity();
-            if ( cpus.size() != cpus_orig.size() )
-                pass = false;
-        }
-    }
-    if ( pass ) {
-        ut.passes( "setProcessAffinity" );
-    } else {
-#ifdef __APPLE__
-        ut.expected_failure( "setProcessAffinity" );
-#else
-        ut.failure( "setProcessAffinity" );
-#endif
-    }
-    int N_procs_used = std::min<int>( N_procs, N_threads );
-    printp( "%i processors used\n", N_procs_used );
-
-
-    // Create the thread pool
-    barrier();
-    printp( "Creating thread pool\n" );
-    ThreadPool tpool0;
-    ThreadPool tpool;
-    ThreadPool::thread_id_t id;
-    id = TPOOL_ADD_WORK( &tpool, waste_cpu, ( data1[0] ) );
-    if ( id == ThreadPool::thread_id_t() || !tpool.isValid( id ) )
-        ut.failure( "Errors with id" );
-    tpool.setNumThreads( N_threads );
-    if ( tpool.getNumThreads() == N_threads )
-        ut.passes( "Created thread pool" );
-    else
-        ut.failure( "Failed to create tpool with desired number of threads" );
-
-
-    // Test setting the thread affinities
-    barrier();
-    if ( cpus.size() > 1 ) {
-        sleep_ms( 50 );
-        // First make sure we can get the thread affinities
-        std::vector<int> procs = ThreadPool::getThreadAffinity();
-        if ( procs == cpus ) {
-            ut.passes( "getThreadAffinity() matches procs" );
-        } else {
-            char msg[100];
-            sprintf( msg, "getThreadAffinity() does not match procs (%i,%i)",
-                static_cast<int>( procs.size() ), static_cast<int>( cpus.size() ) );
-            ut.failure( msg );
-        }
-        pass = true;
-        for ( int i = 0; i < N_threads; i++ ) {
-            std::vector<int> procs_thread = tpool.getThreadAffinity( i );
-            if ( procs_thread != procs ) {
-                printp( "%i: Initial thread affinity: ", rank );
-                for ( int i : procs_thread )
-                    printp( "%i ", i );
-                printp( "\n" );
-                pass = false;
-            }
-        }
-        if ( pass )
-            ut.passes( "getThreadAffinity(thread) matches procs" );
-        else
-            ut.failure( "getThreadAffinity(thread) does not match procs" );
-        // Try to set the thread affinities
-        pass = true;
-        if ( !procs.empty() ) {
-            int N_procs_thread = std::max<int>( (int) cpus.size() / N_threads, 1 );
-            for ( int i = 0; i < N_threads; i++ ) {
-                std::vector<int> procs_thread( N_procs_thread, -1 );
-                for ( int j = 0; j < N_procs_thread; j++ )
-                    procs_thread[j] = procs[( i * N_procs_thread + j ) % procs.size()];
-                tpool.setThreadAffinity( i, procs_thread );
-                sleep_ms( 10 ); // Give time for OS to update thread affinities
-                std::vector<int> procs_thread2 = tpool.getThreadAffinity( i );
-                if ( procs_thread2 != procs_thread ) {
-                    printp( "%i: Final thread affinity: ", rank );
-                    for ( int i : procs_thread )
-                        printp( "%i ", i );
-                    printp( "\n" );
-                    pass = false;
-                }
-            }
-        }
-        if ( pass )
-            ut.passes( "setThreadAffinity passes" );
-        else
-            ut.failure( "setThreadAffinity failed to change affinity" );
-    }
-
-
-    // Reset the thread affinities
-    barrier();
-    tpool.setNumThreads( tpool.getNumThreads(), "none" );
-    // tpool.setNumThreads(tpool.getNumThreads(),"independent");
-    for ( int i = 0; i < N_threads; i++ ) {
-        std::vector<int> procs_thread = tpool.getThreadAffinity( i );
-        printp( "Thread affinity: " );
-        for ( int i : procs_thread )
-            printp( "%i ", i );
-        printp( "\n" );
-    }
-
-    // Print the current processors by thread id
-    barrier();
-    ThreadPool::set_OS_warnings( 1 );
-    print_processor( &tpool );
-    launchAndTime( tpool, N_threads, print_processor, &tpool );
-
-    // Run some basic tests
-    barrier();
-    auto start = std::chrono::high_resolution_clock::now();
-    for ( int n = 0; n < N_it; n++ ) {
-        for ( int i = 0; i < N_work; i++ )
-            waste_cpu( data1[i] );
-    }
-    auto stop   = std::chrono::high_resolution_clock::now();
-    double time = std::chrono::duration<double>( stop - start ).count();
-    printp( "Time for serial cycle = %0.0f us\n", 1e6 * time / N_it );
-    printp( "Time for serial item = %0.0f ns\n", 1e9 * time / ( N_it * N_work ) );
-    id = TPOOL_ADD_WORK( &tpool, waste_cpu, ( data1[0] ) );
-    tpool.wait( id );
-    std::vector<ThreadPool::thread_id_t> ids2;
-    ids2.push_back( TPOOL_ADD_WORK( &tpool, waste_cpu, ( data1[0] ) ) );
-    tpool.wait( ids2[0] );
-
-    // Test the move operator for thread_id
-    ThreadPool::thread_id_t id1          = f1( id );         // move-construct from rvalue temporary
-    ThreadPool::thread_id_t id2          = std::move( id1 ); // move-construct from xvalue
-    volatile ThreadPool::thread_id_t id3 = f2( id );         // move-construct from rvalue temporary
-    volatile ThreadPool::thread_id_t id4 = std::move( id3 ); // move-construct from xvalue
-    id2.reset();
-    id4.reset();
-
-    // Test calling functions with different number of arguments
-    barrier();
-    printp( "Testing arguments:\n" );
-    int N_errors_args = test_function_arguements( &tpool );
-    if ( N_errors_args == 0 )
-        ut.passes( "Calling function with default arguments" );
-    else
-        ut.failure( "Error calling function with default arguments" );
-
-
-    // Check that the threads can sleep in parallel (this does not depend on the number of
-    // processors)
-    barrier();
-    tpool.wait_pool_finished();
-    start = std::chrono::high_resolution_clock::now();
-    sleep_inc( 1 );
-    stop                  = std::chrono::high_resolution_clock::now();
-    double sleep_serial   = std::chrono::duration<double>( stop - start ).count();
-    double sleep_parallel = launchAndTime( tpool, N_threads, sleep_inc, 1 );
-    double sleep_speedup  = N_procs_used * sleep_serial / sleep_parallel;
-    printf( "%i:  Speedup on %i sleeping threads: %0.3f\n", rank, N_procs_used, sleep_speedup );
-    printf( "%i:    ts = %0.3f, tp = %0.3f\n", rank, sleep_serial, sleep_parallel );
-    if ( fabs( sleep_serial - 1.0 ) < 0.05 && fabs( sleep_parallel - 1.0 ) < 0.25 &&
-         sleep_speedup > 3 )
-        ut.passes( "Passed thread sleep" );
-    else
-        ut.failure( "Failed thread sleep" );
-
-
-    // Check that the threads are actually working in parallel
-    barrier();
-    if ( N_procs_used > 1 ) {
-#ifdef USE_MPI
-        // Use a non-blocking serialization of the MPI processes
-        // if we do not have a sufficient number of processors
-        bool serialize_mpi = N_procs < N_threads * size;
-        int buf;
-        MPI_Request request;
-        MPI_Status status;
-        if ( serialize_mpi && rank > 0 ) {
-            MPI_Irecv( &buf, 1, MPI_INT, rank - 1, 0, MPI_COMM_WORLD, &request );
-            int flag = false;
-            while ( !flag ) {
-                MPI_Test( &request, &flag, &status );
-                sleep_s( 1 );
-            }
-        }
-#endif
-        int N = 20000000; // Enough work to keep the processor busy for ~ 1 s
-        // Run in serial
-        start = std::chrono::high_resolution_clock::now();
-        waste_cpu( N );
-        stop               = std::chrono::high_resolution_clock::now();
-        double time_serial = std::chrono::duration<double>( stop - start ).count();
-        // Run in parallel
-        double time_parallel  = launchAndTime( tpool, N_procs_used, waste_cpu, N );
-        double time_parallel2 = launchAndTime( tpool, N_procs_used, waste_cpu, N / 1000 );
-        double speedup        = N_procs_used * time_serial / time_parallel;
-        printf( "%i:  Speedup on %i procs: %0.3f\n", rank, N_procs_used, speedup );
-        printf( "%i:    ts = %0.3f, tp = %0.3f, tp2 = %0.3f\n", rank, time_serial, time_parallel,
-            time_parallel2 );
-        if ( speedup > 1.4 ) {
-            ut.passes( "Passed speedup test" );
-        } else {
-#ifdef USE_GCOV
-            ut.expected_failure( "Times do not indicate tests are running in parallel (gcov)" );
-#else
-            ut.failure( "Times do not indicate tests are running in parallel" );
-#endif
-        }
-#ifdef USE_MPI
-        if ( serialize_mpi ) {
-            if ( rank < size - 1 )
-                MPI_Send( &N, 1, MPI_INT, rank + 1, 0, MPI_COMM_WORLD );
-            if ( rank == size - 1 ) {
-                for ( int i = 0; i < size - 1; i++ )
-                    MPI_Send( &N, 1, MPI_INT, i, 1, MPI_COMM_WORLD );
-            } else {
-                MPI_Irecv( &buf, 1, MPI_INT, size - 1, 1, MPI_COMM_WORLD, &request );
-                int flag = false;
-                MPI_Status status;
-                while ( !flag ) {
-                    MPI_Test( &request, &flag, &status );
-                    sleep_s( 1 );
-                }
-            }
-        }
-#endif
-    } else {
-        ut.expected_failure( "Testing thread performance with less than 1 processor" );
-    }
-
-
-    // Test first-in-first-out scheduler (also ensures priorities)
-    test_FIFO( ut, tpool );
-
-
-    // Test adding a work item with a dependency
-    barrier();
-    {
-        // Test that we sucessfully wait on the work items
-        std::vector<ThreadPool::thread_id_t> ids;
-        ids.reserve( 5 );
-        global_sleep_count = 0; // Reset the count before this test
-        ThreadPool::thread_id_t id0;
-        auto id1    = TPOOL_ADD_WORK( &tpool, sleep_inc, ( 1 ) );
-        auto id2    = TPOOL_ADD_WORK( &tpool, sleep_inc, ( 2 ) );
-        auto *wait1 = new WorkItemFull<bool, int>( check_inc, 1 );
-        auto *wait2 = new WorkItemFull<bool, int>( check_inc, 2 );
-        wait1->add_dependency( id0 );
-        wait1->add_dependency( id1 );
-        wait2->add_dependency( id1 );
-        wait2->add_dependency( id2 );
-        ids.clear();
-        ids.push_back( tpool.add_work( wait1 ) );
-        ids.push_back( tpool.add_work( wait2 ) );
-        tpool.wait_all( ids.size(), &ids[0] );
-        if ( !tpool.getFunctionRet<bool>( ids[0] ) || !tpool.getFunctionRet<bool>( ids[1] ) )
-            ut.failure( "Failed to wait on required dependency" );
-        else
-            ut.passes( "Dependencies" );
-        tpool.wait_pool_finished();
-        // Test waiting on more dependencies than in the thread pool (changing priorities)
-        ids.clear();
-        for ( size_t i = 0; i < 20; i++ )
-            ids.push_back( TPOOL_ADD_WORK( &tpool, sleep_inc2, ( 0.1 ) ) );
-        auto *wait3 = new WorkItemFull<void, double>( sleep_inc2, 0 );
-        wait3->add_dependencies( ids );
-        id = tpool.add_work( wait3, 50 );
-        tpool.wait( id );
-        bool pass = true;
-        for ( auto &id : ids )
-            pass = pass && id.finished();
-        ids.clear();
-        if ( pass )
-            ut.passes( "Dependencies2" );
-        else
-            ut.failure( "Dependencies2" );
-        // Check that we can handle more complex dependencies
-        id1 = TPOOL_ADD_WORK( &tpool, sleep_inc2, ( 0.5 ) );
-        for ( int i = 0; i < 10; i++ ) {
-            wait1 = new WorkItemFull<bool, int>( check_inc, 1 );
-            wait1->add_dependency( id1 );
-            tpool.add_work( wait1 );
-        }
-        tpool.wait_pool_finished();
-        ids.clear();
-        for ( int i = 0; i < 5; i++ )
-            ids.push_back( TPOOL_ADD_WORK( &tpool, sleep_inc2, ( 0.5 ) ) );
-        sleep_inc2( 0.002 );
-        ThreadPool::WorkItem *work = new WorkItemFull<void, int>( waste_cpu, 100 );
-        work->add_dependencies( ids );
-        id = tpool.add_work( work, 10 );
-        tpool.wait( id );
-    }
-
-    // Test the timing creating and running a work item
-    barrier();
-    {
-        printp( "Testing timmings (creating/running work item):\n" );
-        std::string timer_name = "Create/Run work item";
-        PROFILE_START( timer_name );
-        int64_t time_create = 0;
-        int64_t time_run    = 0;
-        int64_t time_delete = 0;
-        std::vector<ThreadPool::WorkItem *> work( N_work );
-        start = std::chrono::high_resolution_clock::now();
-        for ( int n = 0; n < N_it; n++ ) {
-            auto t1 = std::chrono::high_resolution_clock::now();
-            for ( int i = 0; i < N_work; i++ )
-                work[i] = ThreadPool::createWork<void, int>( waste_cpu, data1[i] );
-            auto t2 = std::chrono::high_resolution_clock::now();
-            for ( int i = 0; i < N_work; i++ )
-                work[i]->run();
-            auto t3 = std::chrono::high_resolution_clock::now();
-            for ( int i = 0; i < N_work; i++ )
-                delete work[i];
-            auto t4 = std::chrono::high_resolution_clock::now();
-            time_create += to_ns( t2 - t1 );
-            time_run += to_ns( t3 - t2 );
-            time_delete += to_ns( t4 - t3 );
-            if ( ( n + 1 ) % 100 == 0 )
-                printp( "Cycle %i of %i finished\n", n + 1, N_it );
-        }
-        stop = std::chrono::high_resolution_clock::now();
-        time = std::chrono::duration<double>( stop - start ).count();
-        PROFILE_STOP( timer_name );
-        printp( "   time = %0.0f ms\n", 1e3 * time );
-        printp( "   time / cycle = %0.0f us\n", 1e6 * time / N_it );
-        printp( "   average time / item = %0.0f ns\n", 1e9 * time / ( N_it * N_work ) );
-        printp( "      create = %i ns\n", time_create / ( N_it * N_work ) );
-        printp( "      run    = %i ns\n", time_run / ( N_it * N_work ) );
-        printp( "      delete = %i us\n", time_delete / ( N_it * N_work ) );
-    }
-
-    // Test the timing adding a single item
-    barrier();
-    for ( int it = 0; it < 2; it++ ) {
-        ThreadPool *tpool_ptr = nullptr;
-        std::string timer_name;
-        if ( it == 0 ) {
-            printp( "Testing timmings (adding a single item to empty tpool):\n" );
-            timer_name = "Add single item to empty pool";
-            tpool_ptr  = &tpool0;
-        } else if ( it == 1 ) {
-            printp( "Testing timmings (adding a single item):\n" );
-            timer_name = "Add single item to tpool";
-            tpool_ptr  = &tpool;
-        }
-        PROFILE_START( timer_name );
-        std::vector<ThreadPool::thread_id_t> ids( N_work );
-        int64_t time_add  = 0;
-        int64_t time_wait = 0;
-        start             = std::chrono::high_resolution_clock::now();
-        for ( int n = 0; n < N_it; n++ ) {
-            auto t1 = std::chrono::high_resolution_clock::now();
-            for ( int i = 0; i < N_work; i++ )
-                ids[i] = TPOOL_ADD_WORK( tpool_ptr, waste_cpu, ( data1[i] ), priority[i] );
-            auto t2 = std::chrono::high_resolution_clock::now();
-            tpool_ptr->wait_all( N_work, &ids[0] );
-            auto t3 = std::chrono::high_resolution_clock::now();
-            time_add += to_ns( t2 - t1 );
-            time_wait += to_ns( t3 - t2 );
-            if ( ( n + 1 ) % 100 == 0 )
-                printp( "Cycle %i of %i finished\n", n + 1, N_it );
-        }
-        stop = std::chrono::high_resolution_clock::now();
-        time = std::chrono::duration<double>( stop - start ).count();
-        PROFILE_STOP( timer_name );
-        printp( "   time = %0.0f ms\n", 1e3 * time );
-        printp( "   time / cycle = %0.0f us\n", 1e6 * time / N_it );
-        printp( "   average time / item = %0.0f ns\n", 1e9 * time / ( N_it * N_work ) );
-        printp( "      create and add = %i ns\n", time_add / ( N_it * N_work ) );
-        printp( "      wait = %i us\n", time_wait / ( N_it * N_work ) );
-    }
-
-    // Test the timing pre-creating the work items and adding multiple at a time
-    barrier();
-    for ( int it = 0; it < 2; it++ ) {
-        ThreadPool *tpool_ptr = nullptr;
-        std::string timer_name;
-        if ( it == 0 ) {
-            printp( "Testing timmings (adding a block of items to empty tpool):\n" );
-            timer_name = "Add multiple items to empty pool";
-            tpool_ptr  = &tpool0;
-        } else if ( it == 1 ) {
-            printp( "Testing timmings (adding a block of items):\n" );
-            timer_name = "Add multiple items to tpool";
-            tpool_ptr  = &tpool;
-        }
-        PROFILE_START( timer_name );
-        int64_t time_create_work = 0;
-        int64_t time_add_work    = 0;
-        int64_t time_wait_work   = 0;
-        std::vector<ThreadPool::WorkItem *> work( N_work );
-        start = std::chrono::high_resolution_clock::now();
-        for ( int n = 0; n < N_it; n++ ) {
-            auto t1 = std::chrono::high_resolution_clock::now();
-            for ( int i = 0; i < N_work; i++ )
-                work[i] = ThreadPool::createWork<void, int>( waste_cpu, data1[i] );
-            auto t2  = std::chrono::high_resolution_clock::now();
-            auto ids = tpool_ptr->add_work( work, priority );
-            auto t3  = std::chrono::high_resolution_clock::now();
-            tpool_ptr->wait_all( ids );
-            auto t4 = std::chrono::high_resolution_clock::now();
-            time_create_work += to_ns( t2 - t1 );
-            time_add_work += to_ns( t3 - t2 );
-            time_wait_work += to_ns( t4 - t3 );
-            if ( ( n + 1 ) % 100 == 0 )
-                printp( "Cycle %i of %i finished\n", n + 1, N_it );
-        }
-        stop = std::chrono::high_resolution_clock::now();
-        time = std::chrono::duration<double>( stop - start ).count();
-        PROFILE_STOP( timer_name );
-        printp( "   time = %0.0f ms\n", 1e3 * time );
-        printp( "   time / cycle = %0.0f us\n", 1e6 * time / N_it );
-        printp( "   average time / item = %0.0f ns\n", 1e9 * time / ( N_it * N_work ) );
-        printp( "      create = %i ns\n", time_create_work / ( N_it * N_work ) );
-        printp( "      add = %i ns\n", time_add_work / ( N_it * N_work ) );
-        printp( "      wait = %i ns\n", time_wait_work / ( N_it * N_work ) );
-    }
-
-    // Run a dependency test that tests a simple case that should keep the thread pool busy
-    // Note: Checking the results requires looking at the trace data
-    tpool.wait_pool_finished();
-    PROFILE_START( "Dependency test" );
-    for ( int i = 0; i < 10; i++ ) {
-        char msg[3][100];
-        sprintf( msg[0], "Item %i-%i", i, 0 );
-        sprintf( msg[1], "Item %i-%i", i, 1 );
-        sprintf( msg[2], "Item %i-%i", i, 2 );
-        ThreadPool::WorkItem *work =
-            new WorkItemFull<void, double, std::string>( sleep_msg, 0.5, msg[0] );
-        ThreadPool::WorkItem *work1 =
-            new WorkItemFull<void, double, std::string>( sleep_msg, 0.1, msg[1] );
-        ThreadPool::WorkItem *work2 =
-            new WorkItemFull<void, double, std::string>( sleep_msg, 0.1, msg[2] );
-        ThreadPool::thread_id_t id = tpool.add_work( work );
-        work1->add_dependency( id );
-        work2->add_dependency( id );
-        tpool.add_work( work1 );
-        tpool.add_work( work2 );
-    }
-    tpool.wait_pool_finished();
-    PROFILE_STOP( "Dependency test" );
-
-    // Close the thread pool
-    tpool.setNumThreads( 0 );
-
-    // Save the profiling results
-    PROFILE_SAVE( "test_thread_pool" );
-    PROFILE_DISABLE();
-
-    // Test creating/destroying a thread pool using new
-    barrier();
-    pass = true;
-    try {
-        ThreadPool *tpool = new ThreadPool( ThreadPool::MAX_NUM_THREADS - 1 );
-        if ( tpool->getNumThreads() != ThreadPool::MAX_NUM_THREADS - 1 )
-            pass = false;
-        if ( !ThreadPool::is_valid( tpool ) )
-            pass = false;
-        delete tpool;
-        // Check that tpool is invalid
-        // Note: valgrind will report this as an invalid memory read, but we want to keep the test)
-        if ( ThreadPool::is_valid( tpool ) )
-            pass = false;
-    } catch ( ... ) {
-        pass = false;
-    }
-    if ( pass )
-        ut.passes( "Created/destroyed thread pool with new" );
-    else
-        ut.failure( "Created/destroyed thread pool with new" );
-
-    // Print the test results
-    barrier();
-    ut.report();
-    auto N_errors = static_cast<int>( ut.NumFailGlobal() );
-
-    // Shudown MPI
-    pout << "Shutting down\n";
-    barrier();
-#ifdef USE_TIMER
-    if ( rank == 0 )
-        MemoryApp::print( pout );
-#endif
-#ifdef USE_MPI
-    MPI_Finalize();
-    sleep_ms( 10 );
-#endif
-    return N_errors;
-}
diff --git a/threadpool/thread_pool.cpp b/threadpool/thread_pool.cpp
index 4cf7e222..837909cb 100644
--- a/threadpool/thread_pool.cpp
+++ b/threadpool/thread_pool.cpp
@@ -1,8 +1,10 @@
-#define _CRT_NONSTDC_NO_DEPRECATE
 #include "threadpool/thread_pool.h"
 #include "common/Utilities.h"
-#include "common/StackTrace.h"
+#include "StackTrace/StackTrace.h"
+#include "StackTrace/Utilities.h"
+
 #include "ProfilerApp.h"
+
 #include <algorithm>
 #include <bitset>
 #include <chrono>
@@ -10,11 +12,17 @@
 #include <cstdio>
 #include <cstdlib>
 #include <iostream>
+#include <random>
 #include <stdexcept>
 #include <thread>
 #include <typeinfo>
 
 
+// Add profile timers or performance counters to the threadpool
+#define PROFILE_THREADPOOL_PERFORMANCE 0
+#define MONITOR_THREADPOOL_PERFORMANCE 0
+
+
 #define perr std::cerr
 #define pout std::cout
 #define printp printf
@@ -34,7 +42,6 @@
 #if defined( USE_WINDOWS )
     #include <process.h>
     #include <windows.h>
-    #define NOMINMAX
     // Disable warning: the inline specifier cannot be used when a friend
     // declaration refers to a specialization of a function template
     #pragma warning( disable : 4396 )
@@ -62,30 +69,23 @@
 
 
 // Set some macros
-#if PROFILE_THREADPOOL_PERFORMANCE
-#define PROFILE_THREADPOOL_START( X ) PROFILE_START( X, 3 )
-#define PROFILE_THREADPOOL_START2( X ) PROFILE_START2( X, 3 )
-#define PROFILE_THREADPOOL_STOP( X ) PROFILE_STOP( X, 3 )
-#define PROFILE_THREADPOOL_STOP2( X ) PROFILE_STOP2( X, 3 )
+// clang-format off
+#if PROFILE_THREADPOOL_PERFORMANCE == 1
+#define PROFILE_THREADPOOL_START(X)  PROFILE_START(X,3)
+#define PROFILE_THREADPOOL_START2(X) PROFILE_START2(X,3)
+#define PROFILE_THREADPOOL_STOP(X)   PROFILE_STOP(X,3)
+#define PROFILE_THREADPOOL_STOP2(X)  PROFILE_STOP2(X,3)
 #else
-#define PROFILE_THREADPOOL_START( X ) \
-    do {                              \
-    } while ( 0 )
-#define PROFILE_THREADPOOL_START2( X ) \
-    do {                               \
-    } while ( 0 )
-#define PROFILE_THREADPOOL_STOP( X ) \
-    do {                             \
-    } while ( 0 )
-#define PROFILE_THREADPOOL_STOP2( X ) \
-    do {                              \
-    } while ( 0 )
+#define PROFILE_THREADPOOL_START(X)  do {} while ( 0 )
+#define PROFILE_THREADPOOL_START2(X) do {} while ( 0 )
+#define PROFILE_THREADPOOL_STOP(X)   do {} while ( 0 )
+#define PROFILE_THREADPOOL_STOP2(X)  do {} while ( 0 )
 #endif
 #if MONITOR_THREADPOOL_PERFORMANCE == 1
-#define accumulate( x, t1, t2 )   \
-    AtomicOperations::atomic_add( \
-        &x, std::chrono::duration_cast<std::chrono::nanoseconds>( t2 - t1 ).count() );
+#define accumulate(x,t1,t2)  AtomicOperations::atomic_add( \
+    &x, std::chrono::duration_cast<std::chrono::nanoseconds>(t2-t1).count() );
 #endif
+// clang-format on
 
 
 #if MONITOR_THREADPOOL_PERFORMANCE == 1
@@ -93,37 +93,59 @@ static AtomicOperations::int64_atomic total_add_work_time[5] = { 0, 0, 0, 0, 0 }
 #endif
 
 
+// Set an environment variable (an empty value requests clearing it); guarded by Utilities_mutex
+static std::mutex Utilities_mutex;
+void setenv( const std::string &name, const std::string &value )
+{
+    Utilities_mutex.lock();
+#if defined( USE_LINUX ) || defined( USE_MAC )
+    // A non-empty value sets the variable (overwriting); an empty value removes it
+    const bool pass = value.empty() ? ( ::unsetenv( name.data() ) == 0 ) :
+                                      ( ::setenv( name.data(), value.data(), 1 ) == 0 );
+#elif defined( USE_WINDOWS )
+    bool pass = SetEnvironmentVariable( name.data(), value.data() ) != 0;
+#else
+#error Unknown OS
+#endif
+    Utilities_mutex.unlock();
+    if ( !pass ) {
+        char msg[1024];
+        // Bounded formatting: name/value can be longer than the message buffer
+        if ( !value.empty() )
+            snprintf( msg, sizeof( msg ), "Error setting enviornmental variable: %s=%s\n",
+                name.data(), value.data() );
+        else
+            snprintf( msg, sizeof( msg ), "Error clearing enviornmental variable: %s\n",
+                name.data() );
+        ERROR( msg );
+    }
+}
+
+
 // Helper functions
 template<class T>
 void quicksort( int N, T *data );
 template<class T>
 inline void quicksort( std::vector<T> &x )
 {
-    quicksort( (int) x.size(), x.data() );
+    quicksort( x.size(), x.data() );
 }
 static inline int find_id( int, const ThreadPool::thread_id_t *, const ThreadPool::thread_id_t & );
 
 
-// Function to generate a random size_t number (excluding 0 and ~0)
-static size_t rand_size_t()
+// Functions to validate / generate the random head/tail key used to check if tpool is valid
+static inline bool validHeadTail( uint32_t key )
 {
-    size_t key = 0;
-    double tmp = 1;
-    if ( sizeof( size_t ) == 4 ) {
-        while ( tmp < 4e9 ) {
-            key ^= rand() * 0x9E3779B9; // 2^32*0.5*(sqrt(5)-1)
-            tmp *= RAND_MAX;
-        }
-    } else if ( sizeof( size_t ) == 8 ) {
-        while ( tmp < 1.8e19 ) {
-            key ^= rand() * 0x9E3779B97F4A7C15; // 2^64*0.5*(sqrt(5)-1)
-            tmp *= RAND_MAX;
-        }
-    } else {
-        throw std::logic_error( "Unhandled case" );
-    }
-    if ( key == 0 || ( ~key ) == 0 )
-        key = rand_size_t();
+    return ( key > 10 ) && ( ~key > 10 ) && ( key % 2 != 0 ) && ( key % 3 == 2 );
+}
+static inline uint32_t generateHeadTail()
+{
+    uint32_t key = 0;
+    std::random_device rd;
+    std::mt19937 gen( rd() );
+    std::uniform_int_distribution<> dis( 1, 0xFFFFFF );
+    while ( !validHeadTail( key ) )
+        key = static_cast<uint32_t>( dis( gen ) ) * 0x9E3779B9; // 2^32*0.5*(sqrt(5)-1)
     return key;
 }
 
@@ -131,22 +153,10 @@ static size_t rand_size_t()
 /******************************************************************
  * Run some basic compile-time checks                              *
  ******************************************************************/
-#if MAX_NUM_THREADS % 64 != 0
-// We use a bit array for d_active and d_cancel
-#error MAX_NUM_THREADS must be a multiple of 64
-#endif
-#if MAX_NUM_THREADS >= 65535
-// We store N_threads as a short int
-#error MAX_NUM_THREADS must < 65535
-#endif
-#if MAX_QUEUED >= 65535
-// We store the indicies to the queue list as short ints
-#error MAX_QUEUED must < 65535
-#endif
-// Check the c++ std
-#if CXX_STD == 98
-#error Thread pool class requires c++11 or newer
-#endif
+static_assert( ThreadPool::MAX_THREADS % 64 == 0, "MAX_THREADS must be a multiple of 64" );
+static_assert( ThreadPool::MAX_THREADS < 65535, "MAX_THREADS must < 65535" );
+static_assert( sizeof( AtomicOperations::int32_atomic ) == 4, "atomic32 must be a 32-bit integer" );
+static_assert( sizeof( AtomicOperations::int64_atomic ) == 8, "atomic64 must be a 64-bit integer" );
 
 
 /******************************************************************
@@ -181,7 +191,7 @@ static inline bool get_bit( const volatile AtomicOperations::int64_atomic *x, si
     uint64_t mask = 0x01;
     mask <<= index % 64;
     // This is thread-safe since we only care about a single bit
-    AtomicOperations::int64_atomic y = x[index / 64]; 
+    AtomicOperations::int64_atomic y = x[index / 64];
     return ( y & mask ) != 0;
 }
 
@@ -214,18 +224,15 @@ static inline int count_bits( int_type x )
 /******************************************************************
  * Set the global constants                                        *
  ******************************************************************/
-constexpr int ThreadPool::MAX_NUM_THREADS;
-constexpr int ThreadPool::MAX_QUEUED;
-constexpr int ThreadPool::MAX_WAIT;
-constexpr bool ThreadPool::PROFILE_THREADPOOL_PERFORMANCE;
-constexpr bool ThreadPool::MONITOR_THREADPOOL_PERFORMANCE;
+constexpr uint16_t ThreadPool::MAX_THREADS;
+constexpr uint16_t ThreadPool::MAX_WAIT;
 
 
 /******************************************************************
  * Set the behavior of OS warnings                                 *
  ******************************************************************/
 static int global_OS_behavior = 0;
-std::mutex OS_warning_mutex;
+static std::mutex OS_warning_mutex;
 void ThreadPool::set_OS_warnings( int behavior )
 {
     ASSERT( behavior >= 0 && behavior <= 2 );
@@ -249,18 +256,7 @@ void ThreadPool::setErrorHandler( std::function<void( const std::string & )> fun
 /******************************************************************
  * Function to return the number of processors availible           *
  ******************************************************************/
-int ThreadPool::getNumberOfProcessors()
-{
-#if defined( USE_LINUX ) || defined( USE_MAC )
-    return sysconf( _SC_NPROCESSORS_ONLN );
-#elif defined( USE_WINDOWS )
-    SYSTEM_INFO sysinfo;
-    GetSystemInfo( &sysinfo );
-    return static_cast<int>( sysinfo.dwNumberOfProcessors );
-#else
-#error Unknown OS
-#endif
-}
+int ThreadPool::getNumberOfProcessors() { return std::thread::hardware_concurrency(); }
 
 
 /******************************************************************
@@ -293,19 +289,17 @@ std::vector<int> ThreadPool::getProcessAffinity()
     int error = sched_getaffinity( getpid(), sizeof( cpu_set_t ), &mask );
     if ( error != 0 )
         throw std::logic_error( "Error getting process affinity" );
-    for ( int i = 0; i < (int) sizeof( cpu_set_t ) * CHAR_BIT; i++ ) {
+    for ( size_t i = 0; i < sizeof( cpu_set_t ) * CHAR_BIT; i++ ) {
         if ( CPU_ISSET( i, &mask ) )
             procs.push_back( i );
     }
 #else
 #warning sched_getaffinity is not supported for this compiler/OS
     OS_warning( "sched_getaffinity is not supported for this compiler/OS" );
-    procs.clear();
 #endif
 #elif defined( USE_MAC )
     // MAC does not support getting or setting the affinity
     OS_warning( "MAC does not support getting the process affinity" );
-    procs.clear();
 #elif defined( USE_WINDOWS )
     HANDLE hProc = GetCurrentProcess();
     size_t procMask;
@@ -313,7 +307,7 @@ std::vector<int> ThreadPool::getProcessAffinity()
     PDWORD_PTR procMaskPtr = reinterpret_cast<PDWORD_PTR>( &procMask );
     PDWORD_PTR sysMaskPtr  = reinterpret_cast<PDWORD_PTR>( &sysMask );
     GetProcessAffinityMask( hProc, procMaskPtr, sysMaskPtr );
-    for ( int i = 0; i < (int) sizeof( size_t ) * CHAR_BIT; i++ ) {
+    for ( size_t i = 0; i < sizeof( size_t ) * CHAR_BIT; i++ ) {
         if ( ( procMask & 0x1 ) != 0 )
             procs.push_back( i );
         procMask >>= 1;
@@ -323,7 +317,7 @@ std::vector<int> ThreadPool::getProcessAffinity()
 #endif
     return procs;
 }
-void ThreadPool::setProcessAffinity( std::vector<int> procs )
+void ThreadPool::setProcessAffinity( const std::vector<int> &procs )
 {
 #ifdef USE_LINUX
 #ifdef _GNU_SOURCE
@@ -337,12 +331,10 @@ void ThreadPool::setProcessAffinity( std::vector<int> procs )
 #else
 #warning sched_setaffinity is not supported for this compiler/OS
     OS_warning( "sched_setaffinity is not supported for this compiler/OS" );
-    procs.clear();
 #endif
 #elif defined( USE_MAC )
     // MAC does not support getting or setting the affinity
     OS_warning( "MAC does not support setting the process affinity" );
-    procs.clear();
 #elif defined( USE_WINDOWS )
     DWORD mask = 0;
     for ( size_t i = 0; i < procs.size(); i++ )
@@ -365,7 +357,7 @@ DWORD GetThreadAffinityMask( HANDLE thread )
     DWORD old  = 0;
     // try every CPU one by one until one works or none are left
     while ( mask ) {
-        old = static_cast<DWORD>( SetThreadAffinityMask( thread, mask ) );
+        old = SetThreadAffinityMask( thread, mask );
         if ( old ) {                              // this one worked
             SetThreadAffinityMask( thread, old ); // restore original
             return old;
@@ -375,7 +367,6 @@ DWORD GetThreadAffinityMask( HANDLE thread )
         }
         mask <<= 1;
     }
-
     return 0;
 }
 #endif
@@ -388,22 +379,20 @@ std::vector<int> ThreadPool::getThreadAffinity()
     int error = pthread_getaffinity_np( pthread_self(), sizeof( cpu_set_t ), &mask );
     if ( error != 0 )
         throw std::logic_error( "Error getting thread affinity" );
-    for ( int i = 0; i < (int) sizeof( cpu_set_t ) * CHAR_BIT; i++ ) {
+    for ( size_t i = 0; i < sizeof( cpu_set_t ) * CHAR_BIT; i++ ) {
         if ( CPU_ISSET( i, &mask ) )
             procs.push_back( i );
     }
 #else
 #warning pthread_getaffinity_np is not supported
     OS_warning( "pthread does not support pthread_getaffinity_np" );
-    procs.clear();
 #endif
 #elif defined( USE_MAC )
     // MAC does not support getting or setting the affinity
     OS_warning( "MAC does not support getting the thread affinity" );
-    procs.clear();
 #elif defined( USE_WINDOWS )
     size_t procMask = GetThreadAffinityMask( GetCurrentThread() );
-    for ( int i = 0; i < (int) sizeof( size_t ) * CHAR_BIT; i++ ) {
+    for ( size_t i = 0; i < sizeof( size_t ) * CHAR_BIT; i++ ) {
         if ( ( procMask & 0x1 ) != 0 )
             procs.push_back( i );
         procMask >>= 1;
@@ -418,30 +407,28 @@ std::vector<int> ThreadPool::getThreadAffinity( int thread ) const
     if ( thread >= getNumThreads() )
         std::logic_error( "Invalid thread number" );
     std::vector<int> procs;
-    auto handle = const_cast<std::thread &>( d_thread[thread] ).native_handle();
 #ifdef USE_LINUX
 #ifdef _GNU_SOURCE
+    auto handle = const_cast<std::thread &>( d_thread[thread] ).native_handle();
     cpu_set_t mask;
     int error = pthread_getaffinity_np( handle, sizeof( cpu_set_t ), &mask );
     if ( error != 0 )
         throw std::logic_error( "Error getting thread affinity" );
-    for ( int i = 0; i < (int) sizeof( cpu_set_t ) * CHAR_BIT; i++ ) {
+    for ( size_t i = 0; i < sizeof( cpu_set_t ) * CHAR_BIT; i++ ) {
         if ( CPU_ISSET( i, &mask ) )
             procs.push_back( i );
     }
 #else
 #warning pthread_getaffinity_np is not supported
     OS_warning( "pthread does not support pthread_getaffinity_np" );
-    procs.clear();
 #endif
 #elif defined( USE_MAC )
     // MAC does not support getting or setting the affinity
-    NULL_USE( handle );
     OS_warning( "MAC does not support getting the thread affinity" );
-    procs.clear();
 #elif defined( USE_WINDOWS )
+    auto handle     = const_cast<std::thread &>( d_thread[thread] ).native_handle();
     size_t procMask = GetThreadAffinityMask( handle );
-    for ( int i = 0; i < (int) sizeof( size_t ) * CHAR_BIT; i++ ) {
+    for ( size_t i = 0; i < sizeof( size_t ) * CHAR_BIT; i++ ) {
         if ( ( procMask & 0x1 ) != 0 )
             procs.push_back( i );
         procMask >>= 1;
@@ -456,7 +443,7 @@ std::vector<int> ThreadPool::getThreadAffinity( int thread ) const
 /******************************************************************
  * Function to set the thread affinity                             *
  ******************************************************************/
-void ThreadPool::setThreadAffinity( std::vector<int> procs )
+void ThreadPool::setThreadAffinity( const std::vector<int> &procs )
 {
 #ifdef USE_LINUX
 #ifdef _GNU_SOURCE
@@ -470,7 +457,6 @@ void ThreadPool::setThreadAffinity( std::vector<int> procs )
 #else
 #warning pthread_getaffinity_np is not supported
     OS_warning( "pthread does not support pthread_setaffinity_np" );
-    procs.clear();
 #endif
 #elif defined( USE_MAC )
     // MAC does not support getting or setting the affinity
@@ -485,34 +471,33 @@ void ThreadPool::setThreadAffinity( std::vector<int> procs )
 #error Unknown OS
 #endif
 }
-void ThreadPool::setThreadAffinity( int thread, std::vector<int> procs ) const
+void ThreadPool::setThreadAffinity( int thread, const std::vector<int> &procs ) const
 {
     if ( thread >= getNumThreads() )
         std::logic_error( "Invalid thread number" );
-    auto handle = const_cast<std::thread &>( d_thread[thread] ).native_handle();
 #ifdef USE_LINUX
 #ifdef __USE_GNU
     cpu_set_t mask;
     CPU_ZERO( &mask );
     for ( size_t i = 0; i < procs.size(); i++ )
         CPU_SET( procs[i], &mask );
-    int error = pthread_setaffinity_np( handle, sizeof( cpu_set_t ), &mask );
+    auto handle = const_cast<std::thread &>( d_thread[thread] ).native_handle();
+    int error   = pthread_setaffinity_np( handle, sizeof( cpu_set_t ), &mask );
     if ( error != 0 )
         throw std::logic_error( "Error setting thread affinity" );
 #else
 #warning pthread_getaffinity_np is not supported
     OS_warning( "pthread does not support pthread_setaffinity_np" );
-    procs.clear();
 #endif
 #elif defined( USE_MAC )
     // MAC does not support getting or setting the affinity
-    NULL_USE( handle );
     NULL_USE( procs );
     OS_warning( "MAC does not support getting the process affinity" );
 #elif defined( USE_WINDOWS )
     DWORD mask = 0;
     for ( size_t i = 0; i < procs.size(); i++ )
         mask |= ( (DWORD) 1 ) << procs[i];
+    auto handle = const_cast<std::thread &>( d_thread[thread] ).native_handle();
     SetThreadAffinityMask( handle, mask );
 #else
 #error Unknown OS
@@ -523,22 +508,10 @@ void ThreadPool::setThreadAffinity( int thread, std::vector<int> procs ) const
 /******************************************************************
  * Function to perform some basic checks before we start           *
  ******************************************************************/
-void ThreadPool::check_startup( size_t size0 )
+void ThreadPool::check_startup()
 {
-    // Check the size of the class to make sure that we don't have any
-    // byte alignment problems between a library implimentation and a calling pacakge
-    size_t size1 = sizeof( ThreadPool );
-    size_t size2 = ( (size_t) &d_NULL_HEAD ) - ( (size_t) this ) + sizeof( size_t );
-    size_t size3 = ( (size_t) &d_NULL_TAIL ) - ( (size_t) this ) + sizeof( size_t );
-    if ( size0 != size1 || size1 < size2 || size1 < size3 )
-        throw std::logic_error( "Internal data format problem" );
-    // Check the size of variables
-    if ( sizeof( AtomicOperations::int32_atomic ) != 4 )
-        throw std::logic_error( "AtomicOperations::int32_atomic is not 32 bits" );
-    if ( sizeof( AtomicOperations::int64_atomic ) != 8 )
-        throw std::logic_error( "AtomicOperations::int32_atomic is not 64 bits" );
     // Check getting/setting a bit
-    atomic_64 x[2] = { 0x0, 0x7 };
+    AtomicOperations::int64_atomic x[2] = { 0x0, 0x7 };
     set_bit( x, 2 );
     unset_bit( x, 66 );
     if ( x[0] != 4 || x[1] != 3 || !get_bit( x, 2 ) || get_bit( x, 66 ) )
@@ -578,17 +551,21 @@ void ThreadPool::check_startup( size_t size0 )
     if ( isValid( id ) || !isValid( id2 ) )
         pass = false;
     if ( !pass )
-        throw std::logic_error( "Thread pool failed to initialize" );
+        throw std::logic_error( "thread id test failed" );
 }
 
 
 /******************************************************************
- * Function to initialize the thread pool                          *
+ * Constructors/destructor                                         *
  ******************************************************************/
-void ThreadPool::initialize( const int N, const char *affinity, int N_procs, const int *procs )
+ThreadPool::ThreadPool(
+    const int N, const std::string &affinity, const std::vector<int> &procs, int queueSize )
+    : d_queue_list( queueSize )
 {
+    // Run some basic tests on startup
+    check_startup();
     // Initialize the header/tail
-    d_NULL_HEAD = rand_size_t();
+    d_NULL_HEAD = generateHeadTail();
     d_NULL_TAIL = d_NULL_HEAD;
     // Initialize the variables to NULL values
     d_id_assign     = 0;
@@ -600,31 +577,31 @@ void ThreadPool::initialize( const int N, const char *affinity, int N_procs, con
     d_N_started     = 0;
     d_N_finished    = 0;
     d_max_wait_time = 600;
-    memset( (void *) d_active, 0, MAX_NUM_THREADS / 8 );
-    memset( (void *) d_cancel, 0, MAX_NUM_THREADS / 8 );
+    memset( (void *) d_active, 0, MAX_THREADS / 8 );
+    memset( (void *) d_cancel, 0, MAX_THREADS / 8 );
     d_wait_last = nullptr;
     for ( auto &i : d_wait )
         i = nullptr;
     // Initialize the id
     d_id_assign = thread_id_t::maxThreadID;
     // Create the threads
-    setNumThreads( N, affinity, N_procs, procs );
+    setNumThreads( N, affinity, procs );
+    // Verify that the threadpool is valid
+    if ( !is_valid( this ) )
+        throw std::logic_error( "Thread pool is not valid" );
 }
-
-
-/******************************************************************
- * This is the de-constructor                                      *
- ******************************************************************/
 ThreadPool::~ThreadPool()
 {
     DISABLE_WARNINGS
-    if ( !is_valid( this ) )
-        throw std::logic_error( "Thread pool is not valid" );
+    if ( !is_valid( this ) ) {
+        std::cerr << "Thread pool is not valid, error calling destructor\n";
+        return;
+    }
     ENABLE_WARNINGS
     // Destroy the threads
     setNumThreads( 0 );
     // Delete all remaining data
-    d_N_threads = -1;
+    d_N_threads = ~0;
     d_NULL_HEAD = 0;
     d_NULL_TAIL = 0;
     delete d_wait_last;
@@ -645,9 +622,9 @@ bool ThreadPool::is_valid( const ThreadPool *tpool )
 {
     if ( tpool == nullptr )
         return false;
-    if ( tpool->d_N_threads < 0 || tpool->d_N_threads > MAX_NUM_THREADS )
+    if ( tpool->d_N_threads > MAX_THREADS )
         return false;
-    if ( tpool->d_NULL_HEAD == 0 || tpool->d_NULL_HEAD != tpool->d_NULL_TAIL )
+    if ( !validHeadTail( tpool->d_NULL_HEAD ) || tpool->d_NULL_HEAD != tpool->d_NULL_TAIL )
         return false;
     return true;
 }
@@ -657,17 +634,17 @@ bool ThreadPool::is_valid( const ThreadPool *tpool )
  * This function creates the threads in the thread pool            *
  ******************************************************************/
 void ThreadPool::setNumThreads(
-    int num_worker_threads, const char *affinity2, int N_procs, const int *procs )
+    int num_worker_threads, const std::string &affinity, const std::vector<int> &procs )
 {
     // Check if we are a member thread
     if ( isMemberThread() )
         throw std::logic_error(
             "Member threads are not allowed to change the number of threads in the pool" );
     // Determing the number of threads we need to create or destroy
-    if ( num_worker_threads > MAX_NUM_THREADS ) {
-        printp( "Warning: Maximum Number of Threads is %i\n", MAX_NUM_THREADS );
+    if ( num_worker_threads > MAX_THREADS ) {
+        printp( "Warning: Maximum Number of Threads is %i\n", MAX_THREADS );
         printp( "         Only that number will be created\n" );
-        num_worker_threads = MAX_NUM_THREADS;
+        num_worker_threads = MAX_THREADS;
     } else if ( num_worker_threads < 0 ) {
         printp( "Error: cannot have a negitive number of threads\n" );
         printp( "       Setting the number of threads to 0\n" );
@@ -681,23 +658,10 @@ void ThreadPool::setNumThreads(
                 throw std::logic_error(
                     "Threads are being created and destroyed at the same time" );
         }
-// Create the thread attributes (linux only)
-#if defined( USE_LINUX ) || defined( USE_MAC )
-        pthread_attr_t attr;
-        pthread_attr_init( &attr );
-// int ptmp;
-// pthread_attr_setstacksize(&attr,2097152);     // Default stack size is 8MB
-// pthread_attr_setschedpolicy(&attr,1);
-// pthread_attr_getschedpolicy(&attr,&ptmp);
-// pout << "getschedpolicy = " << ptmp << std::endl;
-#endif
         // Create the threads
-        auto tmp = new void *[2 * d_N_threads_diff];
-        int j    = d_N_threads;
+        int j = d_N_threads;
         for ( int i = 0; i < d_N_threads_diff; i++ ) {
             d_N_threads++;
-            tmp[0 + 2 * i] = this;
-            tmp[1 + 2 * i] = reinterpret_cast<void *>( static_cast<size_t>( j ) );
             set_bit( d_cancel, j );
             d_thread[j] = std::thread( create_new_thread, this, j );
             j++;
@@ -713,12 +677,7 @@ void ThreadPool::setNumThreads(
             if ( !wait )
                 break;
         }
-// Delete the thread attributes (linux only)
-#if defined( USE_LINUX ) || defined( USE_MAC )
-        pthread_attr_destroy( &attr );
-#endif
         std::this_thread::sleep_for( std::chrono::milliseconds( 25 ) );
-        delete[] tmp;
     } else if ( d_N_threads_diff < 0 ) {
         // Reduce the number of threads
         if ( num_worker_threads == 0 ) {
@@ -752,15 +711,14 @@ void ThreadPool::setNumThreads(
     } catch ( ... ) {
         pout << "Warning: Unable to get default cpus for thread affinities\n";
     }
-    if ( !cpus.empty() && N_procs > 0 ) {
-        cpus.resize( N_procs );
-        for ( int i = 0; i < N_procs; i++ )
+    if ( !cpus.empty() && !procs.empty() ) {
+        cpus.resize( procs.size() );
+        for ( size_t i = 0; i < procs.size(); i++ )
             cpus[i] = procs[i];
     }
     // Set the affinity model and the associated thread affinities
     // Note: not all OS's support setting the thread affinities
     std::vector<std::vector<int>> t_procs( d_N_threads );
-    std::string affinity( affinity2 );
     if ( cpus.empty() ) {
         // We do not have a list of cpus to use, do nothing (OS not supported)
     } else if ( affinity == "none" ) {
@@ -769,13 +727,13 @@ void ThreadPool::setNumThreads(
             t_procs[i] = cpus;
     } else if ( affinity == "independent" ) {
         // We want to use an independent set of processors for each thread
-        if ( (int) cpus.size() == d_N_threads ) {
+        if ( cpus.size() == d_N_threads ) {
             // The number of cpus matches the number of threads
             for ( int i = 0; i < d_N_threads; i++ )
                 t_procs[i] = std::vector<int>( 1, cpus[i] );
-        } else if ( (int) cpus.size() > d_N_threads ) {
+        } else if ( cpus.size() > d_N_threads ) {
             // There are more cpus than threads, threads will use more the one processor
-            int N_procs_thread = static_cast<int>( cpus.size() + d_N_threads - 1 ) / d_N_threads;
+            int N_procs_thread = ( cpus.size() + d_N_threads - 1 ) / d_N_threads;
             size_t k           = 0;
             for ( int i = 0; i < d_N_threads; i++ ) {
                 for ( int j = 0; j < N_procs_thread && k < cpus.size(); j++ ) {
@@ -785,8 +743,7 @@ void ThreadPool::setNumThreads(
             }
         } else {
             // There are fewer cpus than threads, threads will share a processor
-            auto N_threads_proc =
-                static_cast<int>( ( cpus.size() + d_N_threads - 1 ) / cpus.size() );
+            auto N_threads_proc = ( cpus.size() + d_N_threads - 1 ) / cpus.size();
             for ( int i = 0; i < d_N_threads; i++ )
                 t_procs[i].push_back( cpus[i / N_threads_proc] );
         }
@@ -797,7 +754,7 @@ void ThreadPool::setNumThreads(
     try {
         for ( int i = 0; i < d_N_threads; i++ ) {
             ThreadPool::setThreadAffinity( i, t_procs[i] );
-            std::vector<int> cpus2 = getThreadAffinity( i );
+            auto cpus2 = getThreadAffinity( i );
             if ( cpus2 != t_procs[i] )
                 pout << "Warning: error setting affinities (failed to set)\n";
         }
@@ -823,12 +780,14 @@ void ThreadPool::tpool_thread( int thread_id )
     AtomicOperations::atomic_increment( &d_num_active );
     set_bit( d_active, thread_id );
     unset_bit( d_cancel, thread_id );
+    setenv( "OMP_NUM_THREADS", "1" );
+    setenv( "MKL_NUM_THREADS", "1" );
     if ( printInfo ) {
         // Print the pid
         printp( "pid = %i\n", (int) getpid() );
         // Print the processor affinities for the process
         try {
-            std::vector<int> cpus = ThreadPool::getProcessAffinity();
+            auto cpus = ThreadPool::getProcessAffinity();
             printp( "%i cpus for current thread: ", (int) cpus.size() );
             for ( int cpu : cpus )
                 printp( "%i ", cpu );
@@ -842,7 +801,7 @@ void ThreadPool::tpool_thread( int thread_id )
     shutdown = false;
     while ( !shutdown ) {
         // Check if there is work to do
-        if ( d_queue_list.size() > 0 ) {
+        if ( !d_queue_list.empty() ) {
             // Get next work item to process
             auto work_id =
                 d_queue_list.remove( []( const thread_id_t &id ) { return id.ready(); } );
@@ -890,6 +849,8 @@ void ThreadPool::tpool_thread( int thread_id )
         } else {
             int N_active = AtomicOperations::atomic_decrement( &d_num_active );
             unset_bit( d_active, thread_id );
+            // Yield to give the main thread a chance to update
+            std::this_thread::yield();
             // Alert main thread that a thread finished processing
             if ( ( N_active == 0 ) && d_signal_empty ) {
                 d_wait_finished.notify_all();
@@ -897,7 +858,9 @@ void ThreadPool::tpool_thread( int thread_id )
             }
             // Wait for work
             PROFILE_THREADPOOL_STOP2( "thread active" );
-            d_wait_work.wait_for( 1e-3 );
+            double wait_time = thread_id <= 2 ? 0.01 : 0.1;
+            if ( d_queue_list.empty() )
+                d_wait_work.wait_for( wait_time );
             PROFILE_THREADPOOL_START2( "thread active" );
             AtomicOperations::atomic_increment( &d_num_active );
             set_bit( d_active, thread_id );
@@ -921,13 +884,13 @@ inline void ThreadPool::add_work( const ThreadPool::thread_id_t &id )
     auto work     = id.work();
     work->d_state = 1;
     // Check and change priorities of dependency ids
-    const int priority = id.getPriority();
+    int priority = id.getPriority();
+    auto compare = []( const thread_id_t &a, const thread_id_t &b ) { return a == b; };
     for ( int i = 0; i < work->d_N_ids; i++ ) {
         const auto &id1 = work->d_ids[i];
         if ( !id1.started() && id1 < id ) {
             // Remove and add the id back with a higher priority
-            auto id2 = d_queue_list.remove(
-                []( const thread_id_t &a, const thread_id_t &b ) { return a == b; }, id1 );
+            auto id2 = d_queue_list.remove( compare, id1 );
             id2.setPriority( std::max( priority, id2.getPriority() ) );
             d_queue_list.insert( id2 );
         }
@@ -939,7 +902,7 @@ void ThreadPool::add_work(
     size_t N, ThreadPool::WorkItem *work[], const int *priority, ThreadPool::thread_id_t *ids )
 {
     // If we have a very long list, break it up into smaller pieces to keep the threads busy
-    const size_t block_size = MAX_QUEUED / 8;
+    constexpr size_t block_size = 256;
     if ( N > block_size ) {
         size_t i = 0;
         while ( i < N ) {
@@ -949,13 +912,13 @@ void ThreadPool::add_work(
         return;
     }
     PROFILE_THREADPOOL_START( "add_work" );
-#if MONITOR_THREADPOOL_PERFORMANCE
+#if MONITOR_THREADPOOL_PERFORMANCE == 1
     auto t1 = std::chrono::high_resolution_clock::now();
 #endif
     // Create the thread ids (can be done without blocking)
     for ( size_t i = 0; i < N; i++ )
         ids[i].reset( priority[i], AtomicOperations::atomic_decrement( &d_id_assign ), work[i] );
-#if MONITOR_THREADPOOL_PERFORMANCE
+#if MONITOR_THREADPOOL_PERFORMANCE == 1
     auto t2 = std::chrono::high_resolution_clock::now();
     accumulate( total_add_work_time[0], t1, t2 );
 #endif
@@ -966,7 +929,7 @@ void ThreadPool::add_work(
             work[i]->run();
             work[i]->d_state = 3;
         }
-#if MONITOR_THREADPOOL_PERFORMANCE
+#if MONITOR_THREADPOOL_PERFORMANCE == 1
         auto t5 = std::chrono::high_resolution_clock::now();
         accumulate( total_add_work_time[4], t2, t5 );
 #endif
@@ -974,29 +937,29 @@ void ThreadPool::add_work(
         return;
     }
     // Wait for enough room in the queue (doesn't need blocking since it isn't that precise)
-    if ( N > static_cast<size_t>( MAX_QUEUED - d_queue_list.size() ) ) {
-        auto N_wait = static_cast<int>( N - ( MAX_QUEUED - d_queue_list.size() ) );
+    if ( N > d_queue_list.capacity() - d_queue_list.size() ) {
+        int N_wait = N - ( d_queue_list.capacity() - d_queue_list.size() );
         while ( N_wait > 0 ) {
-            d_signal_count = static_cast<unsigned char>( std::min( N_wait, 255 ) );
+            d_signal_count = std::min<int>( N_wait, 255 );
             d_wait_finished.wait_for( 1e-4 );
-            N_wait = static_cast<int>( N - ( MAX_QUEUED - d_queue_list.size() ) );
+            N_wait = N - ( d_queue_list.capacity() - d_queue_list.size() );
         }
     }
-#if MONITOR_THREADPOOL_PERFORMANCE
+#if MONITOR_THREADPOOL_PERFORMANCE == 1
     auto t3 = std::chrono::high_resolution_clock::now();
     accumulate( total_add_work_time[1], t2, t3 );
 #endif
     // Get add the work items to the queue
     for ( size_t i = 0; i < N; i++ )
         add_work( ids[i] );
-#if MONITOR_THREADPOOL_PERFORMANCE
+#if MONITOR_THREADPOOL_PERFORMANCE == 1
     auto t4 = std::chrono::high_resolution_clock::now();
     accumulate( total_add_work_time[2], t3, t4 );
 #endif
     // Activate sleeping threads
     if ( d_num_active == d_N_threads ) {
         // All threads are active, no need to wake anybody
-    } else if ( d_queue_list.size() == 0 ) {
+    } else if ( d_queue_list.empty() ) {
         // Queue is empty, no need to activate
     } else if ( N == 1 ) {
         // Added 1 item to the queue, wake 1 worker
@@ -1005,7 +968,7 @@ void ThreadPool::add_work(
         // Added multple items in the queue, wake all workers
         d_wait_work.notify_all();
     }
-#if MONITOR_THREADPOOL_PERFORMANCE
+#if MONITOR_THREADPOOL_PERFORMANCE == 1
     auto t5 = std::chrono::high_resolution_clock::now();
     accumulate( total_add_work_time[3], t4, t5 );
 #endif
@@ -1026,8 +989,8 @@ static inline void check_finished(
         }
     }
 }
-int ThreadPool::wait_some(
-    size_t N_work, const ThreadPool::thread_id_t *ids, size_t N_wait, bool *finished ) const
+int ThreadPool::wait_some( size_t N_work, const ThreadPool::thread_id_t *ids, size_t N_wait,
+    bool *finished, int max_wait ) const
 {
     // Check the inputs
     if ( N_wait > N_work )
@@ -1056,13 +1019,21 @@ int ThreadPool::wait_some(
     auto tmp = new wait_ids_struct( N_work, ids, N_wait, d_cond_pool, MAX_WAIT, d_wait );
     // Wait for the ids
     auto t1 = std::chrono::high_resolution_clock::now();
-    while ( !tmp->wait_for( 0.01 ) ) {
-        check_wait_time( t1 );
+    auto t2 = t1;
+    int dt1 = 0;
+    while ( dt1 < max_wait ) {
+        if ( tmp->wait_for( std::min( max_wait, d_max_wait_time ), 0.01 ) )
+            break;
+        auto t3 = std::chrono::high_resolution_clock::now();
+        dt1     = std::chrono::duration_cast<std::chrono::seconds>( t3 - t1 ).count();
+        int dt2 = std::chrono::duration_cast<std::chrono::seconds>( t3 - t2 ).count();
+        if ( dt2 >= d_max_wait_time ) {
+            print_wait_warning();
+            t2 = t3;
+        }
     }
     // Update the ids that have finished
     check_finished( N_work, ids, N_finished, finished );
-    if ( N_finished < N_wait && N_work != 0 )
-        throw std::logic_error( "Internal error: failed to wait" );
     // Delete the wait event struct
     // Note: we want to maintain the reference in case a thread is still using it
     // Note: technically this should be atomic, but it really isn't necessary here
@@ -1075,40 +1046,43 @@ int ThreadPool::wait_some(
 /******************************************************************
  * This function waits for all of the threads to finish their work *
  ******************************************************************/
-void ThreadPool::check_wait_time(
-    std::chrono::time_point<std::chrono::high_resolution_clock> &t1 ) const
+void ThreadPool::print_wait_warning() const
 {
-    auto t2 = std::chrono::high_resolution_clock::now();
-    if ( std::chrono::duration_cast<std::chrono::seconds>( t2 - t1 ).count() > d_max_wait_time ) {
-        pout << "Warning: Maximum wait time in ThreadPool exceeded, threads may be hung\n";
-        pout << "N_active: " << d_num_active << std::endl;
-        pout << "N_queued: " << d_queue_list.size() << std::endl;
-        pout << "N_added: " << d_N_added << std::endl;
-        pout << "N_started: " << d_N_started << std::endl;
-        pout << "N_finished: " << d_N_finished << std::endl;
-        pout << "queue.insert(): " << d_queue_list.N_insert() << std::endl;
-        pout << "queue.remove(): " << d_queue_list.N_remove() << std::endl;
-        pout << "Stack Trace:\n";
-        auto call_stack = StackTrace::getAllCallStacks();
-        StackTrace::cleanupStackTrace( call_stack );
-        auto text = call_stack.print( "  " );
-        for ( auto &line : text )
-            pout << line << std::endl;
-        t1 = std::chrono::high_resolution_clock::now();
-    }
+    pout << "Warning: Maximum wait time in ThreadPool exceeded, threads may be hung\n";
+    pout << "N_active: " << d_num_active << std::endl;
+    pout << "N_queued: " << d_queue_list.size() << std::endl;
+    pout << "N_added: " << d_N_added << std::endl;
+    pout << "N_started: " << d_N_started << std::endl;
+    pout << "N_finished: " << d_N_finished << std::endl;
+    pout << "queue.insert(): " << d_queue_list.N_insert() << std::endl;
+    pout << "queue.remove(): " << d_queue_list.N_remove() << std::endl;
+    pout << "Stack Trace:\n";
+    auto call_stack = StackTrace::getAllCallStacks();
+    StackTrace::cleanupStackTrace( call_stack );
+    auto text = call_stack.print( "  " );
+    for ( auto &line : text )
+        pout << line << std::endl;
 }
 void ThreadPool::wait_pool_finished() const
 {
     // First check that we are not one of the threads
-    if ( isMemberThread() ) {
+    if ( isMemberThread() )
         throw std::logic_error( "Member thread attempted to call wait_pool_finished" );
-    }
     // Wait for all threads to finish their work
     auto t1 = std::chrono::high_resolution_clock::now();
-    while ( d_num_active > 0 || d_queue_list.size() > 0 ) {
-        check_wait_time( t1 );
+    while ( d_num_active > 0 || !d_queue_list.empty() ) {
+        // Wait for signal from last thread
         d_signal_empty = true;
-        d_wait_finished.wait_for( 10e-6 );
+        d_wait_finished.wait_for( 5e-4 );
+        if ( d_num_active == 0 && d_queue_list.empty() )
+            break;
+        // Check that we have not exceeded the maximum time
+        auto t2     = std::chrono::high_resolution_clock::now();
+        int seconds = std::chrono::duration_cast<std::chrono::seconds>( t2 - t1 ).count();
+        if ( seconds > d_max_wait_time ) {
+            print_wait_warning();
+            t1 = t2;
+        }
     }
     d_signal_empty = false;
 }
@@ -1162,30 +1136,46 @@ void ThreadPool::wait_ids_struct::id_finished( const ThreadPool::thread_id_t &id
         }
     }
 }
-bool ThreadPool::wait_ids_struct::wait_for( double seconds )
+inline bool ThreadPool::wait_ids_struct::check()
 {
-    for ( int i = 0; i < d_N; i++ ) {
-        if ( d_ids[i].finished() )
-            d_finished[i] = true;
+    int N_finished = 0;
+    for ( int i = 0; i < d_N; i++ )
+        N_finished += d_finished[i] ? 1 : 0;
+    if ( N_finished >= d_wait || d_N == 0 ) {
+        *d_ptr = nullptr;
+        d_wait = 0;
+        d_N    = 0;
+        return true;
     }
-    auto t1 = std::chrono::high_resolution_clock::now();
-    while ( true ) {
-        int N_finished = 0;
-        for ( int i = 0; i < d_N; i++ )
-            N_finished += d_finished[i] ? 1 : 0;
-        if ( N_finished >= d_wait || d_N == 0 ) {
-            *d_ptr = nullptr;
-            d_wait = 0;
-            d_N    = 0;
-            break;
+    return false;
+}
+bool ThreadPool::wait_ids_struct::wait_for( double total_time, double recheck_time )
+{
+    int total   = 1e6 * total_time;
+    int recheck = 1e6 * recheck_time;
+    auto t1     = std::chrono::high_resolution_clock::now();
+    auto t2     = t1;
+    int us1     = 0;
+    while ( us1 < total ) {
+        for ( int i = 0; i < d_N; i++ ) {
+            if ( d_ids[i].finished() )
+                d_finished[i] = true;
         }
-        auto t2 = std::chrono::high_resolution_clock::now();
-        if ( 1e-6 * std::chrono::duration_cast<std::chrono::microseconds>( t2 - t1 ).count() >
-             seconds )
-            return false;
-        d_wait_event->wait_for( 1e-5 );
+        if ( check() )
+            return true;
+        int us2 = 0;
+        while ( us2 < recheck ) {
+            double dt = 1e-6 * std::max( 10, recheck - us2 );
+            d_wait_event->wait_for( dt );
+            if ( check() )
+                return true;
+            auto t3 = std::chrono::high_resolution_clock::now();
+            us2     = std::chrono::duration_cast<std::chrono::microseconds>( t3 - t2 ).count();
+            t2      = t3;
+        }
+        us1 = std::chrono::duration_cast<std::chrono::microseconds>( t2 - t1 ).count();
     }
-    return true;
+    return false;
 }
 
 
@@ -1298,9 +1288,8 @@ inline int find_id( int n, const ThreadPool::thread_id_t *x, const ThreadPool::t
     // Perform the search
     size_t lower = 0;
     size_t upper = n - 1;
-    size_t index;
     while ( ( upper - lower ) != 1 ) {
-        index = ( upper + lower ) / 2;
+        size_t index = ( upper + lower ) / 2;
         if ( x[index] == id )
             return index;
         if ( x[index] >= id )
@@ -1325,9 +1314,8 @@ void ThreadPool::WorkItem::add_dependencies( size_t N, const ThreadPool::thread_
         throw std::logic_error(
             "Cannot add dependency to work item once it has been added the the threadpool" );
     }
-    if ( static_cast<size_t>( d_N_ids ) + N > 0xFFFF ) {
+    if ( d_N_ids + N > 0xFFFF )
         throw std::logic_error( "Cannot add more than 65000 dependencies" );
-    }
     if ( d_N_ids + N + 1 > d_size ) {
         thread_id_t *tmp = d_ids;
         unsigned int N2  = d_size;
diff --git a/threadpool/thread_pool.h b/threadpool/thread_pool.h
index eff12433..9cb5b21a 100644
--- a/threadpool/thread_pool.h
+++ b/threadpool/thread_pool.h
@@ -5,7 +5,7 @@
 #define included_AtomicModelThreadPool
 
 #include <condition_variable>
-#include <iostream>
+#include <functional>
 #include <map>
 #include <mutex>
 #include <stdarg.h>
@@ -40,7 +40,7 @@
  *       thread_id_t ids[2];
  *       ids[0] = TPOOL_ADD_WORK( tpool, myfun_1, (a,b) );
  *       ids[1] = TPOOL_ADD_WORK( tpool, myfun_2, (c,d) );
- *       int error = wait_all(2,ids);
+ *       wait_all(2,ids);
  *       double x = getFunctionRet(ids[0]);
  *       double y = getFunctionRet(ids[1]); <BR>
  *   </pre>
@@ -49,11 +49,8 @@ class ThreadPool
 {
 public:
     ///// Set some global properties
-    constexpr static int MAX_NUM_THREADS = 128; // The maximum number of threads (must be a multiple of 64)
-    constexpr static int MAX_QUEUED = 1024;     // The maximum number of items in the work queue at any moment
-    constexpr static int MAX_WAIT = 16;         // The maximum number of active waits at any given time
-    constexpr static bool PROFILE_THREADPOOL_PERFORMANCE = false; // Add profile timers to the threadpool
-    constexpr static bool MONITOR_THREADPOOL_PERFORMANCE = false; // Add detailed performance counters
+    constexpr static uint16_t MAX_THREADS = 128; // The maximum number of threads (must be a multiple of 64)
+    constexpr static uint16_t MAX_WAIT = 16;     // The maximum number of active waits at any given time
 
 public:
     ///// Member classes
@@ -117,6 +114,8 @@ public:
         }
         //! Check if thread id is null
         inline bool isNull( ) const { return d_id==nullThreadID; }
+        //! Return the pointer to the work item stored in this id
+        inline WorkItem* getWork( ) const { return reinterpret_cast<WorkItem *>( d_work ); }
 
     private:
         // Reset the internal data to the given values
@@ -174,9 +173,8 @@ public:
          */
         inline void add_dependencies( const std::vector<ThreadPool::thread_id_t> &ids )
         {
-            if ( !ids.empty() ) {
+            if ( !ids.empty() )
                 add_dependencies( ids.size(), &ids[0] );
-            }
         }
         /*!
          * \brief Add a list of work item to the list of dependencies
@@ -201,8 +199,8 @@ public:
         WorkItem( const WorkItem & );            // Private copy constructor
         WorkItem &operator=( const WorkItem & ); // Private assignment operator
         volatile char d_state;                   // Current state (0: not added to threadpool, 1: queued, 2: started, 3: finished)
-        short unsigned int d_N_ids;              // Number of dependencies
-        short unsigned int d_size;               // Size of d_ids
+        uint16_t d_N_ids;                        // Number of dependencies
+        uint16_t d_size;                         // Size of d_ids
         AtomicOperations::int32_atomic d_count;  // Count used by a thread_id
         thread_id_t *d_ids;                      // Pointer to id list
         // Friends
@@ -232,7 +230,7 @@ public:
     protected:
         return_type d_result;
     protected:
-        inline WorkItemRet() { }
+        inline WorkItemRet() : d_result( return_type() ) { }
     private:
         WorkItemRet( const WorkItemRet & );            // Private copy constructor
         WorkItemRet &operator=( const WorkItemRet & ); // Private assignment operator
@@ -242,37 +240,17 @@ public:
 public:
     ///// Member functions
 
-    //! Empty constructor
-    ThreadPool()
-    {
-        // Note: we need the constructor in the header to ensure that check_startup
-        //       is able to check for changes in the byte alignment
-        check_startup( sizeof( ThreadPool ) );
-        initialize( 0, "none", 0, nullptr );
-        if ( !is_valid( this ) )
-            throw std::logic_error( "Thread pool is not valid" );
-    }
-
-
     /*!
      *  Constructor that initialize the thread pool with N threads
-     * @param N    The desired number of worker threads
+     * @param N                 The desired number of worker threads
      * @param affinity          The affinity scheduler to use:
      *                          none - Let the OS handle the affinities (default)
      *                          independent - Give each thread an independent set of processors
      * @param procs             The processors to use (defaults to the process affinitiy list)
+     * @param queueSize         The maximum number of items in the queue before forcing a wait
      */
-    ThreadPool( const int N, const std::string &affinity = "none",
-        const std::vector<int> &procs = std::vector<int>() )
-    {
-        // Note: we need the constructor in the header to ensure that check_startup
-        //       is able to check for changes in the byte alignment
-        check_startup( sizeof( ThreadPool ) );
-        const int *procs2 = procs.empty() ? nullptr : ( &procs[0] );
-        initialize( N, affinity.c_str(), (int) procs.size(), procs2 );
-        if ( !is_valid( this ) )
-            throw std::logic_error( "Thread pool is not valid" );
-    }
+    ThreadPool( const int N = 0, const std::string &affinity = "none",
+        const std::vector<int> &procs = std::vector<int>(), int queueSize = 1024 );
 
 
     //! Destructor
@@ -292,7 +270,7 @@ public:
 
 
     //! Function to set the affinity of the current process
-    static void setProcessAffinity( std::vector<int> procs );
+    static void setProcessAffinity( const std::vector<int>& procs );
 
 
     //! Function to return the affinity of the current thread
@@ -310,7 +288,7 @@ public:
      *  Function to set the affinity of the current thread
      *  @param procs    The processors to use
      */
-    static void setThreadAffinity( std::vector<int> procs );
+    static void setThreadAffinity( const std::vector<int>& procs );
 
 
     /*!
@@ -318,11 +296,11 @@ public:
      *  @param thread   The index of the thread
      *  @param procs    The processors to use
      */
-    void setThreadAffinity( int thread, std::vector<int> procs ) const;
+    void setThreadAffinity( int thread, const std::vector<int>& procs ) const;
 
 
     //! Function to return the number of threads in the thread pool
-    int getNumThreads() const { return d_N_threads; }
+    inline int getNumThreads() const { return d_N_threads; }
 
 
     /*!
@@ -332,21 +310,15 @@ public:
      *   in the ThreadPool without checking the existing work unless the desired number of
      *   threads is 0.  In this case, the function will wait for all work items to finish
      *   before deleting the existing work threads.
-
      *   Member threads may not call this function.
      * @param N                 The desired number of worker threads
      * @param affinity          The affinity scheduler to use:
      *                          none - Let the OS handle the affinities (default)
-
      *                          independent - Give each thread an independent set of processors
      * @param procs             The processors to use (defaults to the process affinitiy list)
      */
-    inline void setNumThreads( const int N, const std::string &affinity = "none",
-        const std::vector<int> &procs = std::vector<int>() )
-    {
-        const int *procs2 = procs.empty() ? nullptr : ( &procs[0] );
-        setNumThreads( N, affinity.c_str(), (int) procs.size(), procs2 );
-    }
+    void setNumThreads( const int N, const std::string &affinity = "none",
+        const std::vector<int> &procs = std::vector<int>() );
 
 
     /*!
@@ -394,6 +366,36 @@ public:
     static inline return_type getFunctionRet( const thread_id_t &id );
 
 
+    /*!
+     * \brief   Function to create a work item
+     * \details This function creates a work item that can be added to the queue
+     * @param routine           Function to call from the thread pool
+     * @param args              Tuple of function arguments to pass
+     */
+    template<class Ret, class... Args>
+    static inline WorkItem* createWork( std::function<Ret(Args...)> routine, std::tuple<Args...> &&args );
+
+
+    /*!
+     * \brief   Function to create a work item
+     * \details This function creates a work item that can be added to the queue
+     * @param routine           Function to call from the thread pool
+     * @param args              Tuple of function arguments to pass
+     */
+    template<class Ret, class... Args>
+    static inline WorkItem* createWork( Ret( *routine )( Args... ), std::tuple<Args...> &&args );
+
+
+    /*!
+     * \brief   Function to create a work item
+     * \details This function creates a work item that can be added to the queue
+     * @param routine           Function to call from the thread pool
+     * @param args              Function arguments to pass
+     */
+    template<class Ret, class... Args>
+    static inline WorkItem* createWork( std::function<Ret(Args...)> routine, Args... args );
+
+
     /*!
      * \brief   Function to create a work item
      * \details This function creates a work item that can be added to the queue
@@ -431,61 +433,33 @@ public:
 
     /*!
      * \brief   Function to wait until a specific work item has finished
-     * \details This is the function waits for a specific work item to finished.  It returns 0 if
-     * successful.
+     * \details This function waits for a specific work item to finish.
      *   Note: any thread may call this routine, but they will block until finished.
      *   For worker threads this may eventually lead to a deadlock.
      * @param id                The work item to wait for
      */
-    inline int wait( thread_id_t id ) const;
+    inline void wait( thread_id_t id ) const;
 
 
     /*!
      * \brief   Function to wait until any of the given work items have finished their work
      * \details This is the function waits for any of the given work items to finish.
      *   If successful it returns the index of a finished work item (the index in the array ids).
-     *   If unseccessful it will return -1.
-     *   Note: any thread may call this routine, but they will block until finished.
-     *   For worker threads this may eventually lead to a deadlock.
-     * @param N_work            The number of work items
-     * @param ids               Array of work items to wait for
-     */
-    inline int wait_any( size_t N_work, const thread_id_t *ids );
-
-
-    /*!
-     * \brief   Function to wait until any of the given work items have finished their work
-     * \details This is the function waits for any of the given work items to finish.
-     *   If successful it returns the index of a finished work item (the index in the array ids).
-     *   If unseccessful it will return -1.
      *   Note: any thread may call this routine, but they will block until finished.
      *   For worker threads this may eventually lead to a deadlock.
      * @param ids               Vector of work items to wait for
      */
-    inline int wait_any( const std::vector<thread_id_t> &ids ) const;
+    inline size_t wait_any( const std::vector<thread_id_t> &ids ) const;
 
 
     /*!
      * \brief   Function to wait until all of the given work items have finished their work
-     * \details This is the function waits for all given of the work items to finish.  It returns 0
-     * if successful.
-     *   Note: any thread may call this routine, but they will block until finished.
-     *   For worker threads this may eventually lead to a deadlock.
-     * @param N_work            The number of work items
-     * @param ids               Array of work items to wait for
-     */
-    inline int wait_all( size_t N_work, const thread_id_t *ids ) const;
-
-
-    /*!
-     * \brief   Function to wait until all of the given work items have finished their work
-     * \details This is the function waits for all given of the work items to finish.  It returns 0
-     * if successful.
+     * \details This function waits for all of the given work items to finish.
      *   Note: any thread may call this routine, but they will block until finished.
      *   For worker threads this may eventually lead to a deadlock.
      * @param ids               Vector of work items to wait for
      */
-    inline int wait_all( const std::vector<thread_id_t> &ids ) const;
+    inline void wait_all( const std::vector<thread_id_t> &ids ) const;
 
 
     /*!
@@ -496,8 +470,9 @@ public:
      *   For worker threads this may eventually lead to a deadlock.
      * @param N_wait            Number of work items to wait for
      * @param ids               Vector of work items to wait for
+     * @param max_wait          Maximum time to wait (seconds)
      */
-    inline std::vector<int> wait_some( int N_wait, const std::vector<thread_id_t> &ids ) const;
+    inline std::vector<int> wait_some( int N_wait, const std::vector<thread_id_t> &ids, int max_wait = 10000000 ) const;
 
 
     /*!
@@ -584,14 +559,13 @@ public: // Static interface
 
     /*!
      * \brief   Function to wait until all of the given work items have finished their work
-     * \details This is the function waits for all given of the work items to finish.  It returns 0
-     * if successful.
+     * \details This function waits for all of the given work items to finish.
      *   Note: any thread may call this routine, but they will block until finished.
      *   For worker threads this may eventually lead to a deadlock.
      * @param tpool         Threadpool containing work (must match call to add_work)
      * @param ids           Vector of work items to wait for
      */
-    static inline int wait_all( const ThreadPool* tpool, const std::vector<thread_id_t> &ids );
+    static inline void wait_all( const ThreadPool* tpool, const std::vector<thread_id_t> &ids );
 
 
     /*!
@@ -604,10 +578,6 @@ public: // Static interface
     static inline void wait_pool_finished( const ThreadPool* tpool ) { if ( tpool ) { tpool->wait_pool_finished(); } }
 
 
-
-private:
-    typedef AtomicOperations::int32_atomic int32_atomic;
-
 private:
     ///// Member data structures
 
@@ -644,11 +614,14 @@ private:
     //    before calling wait
     class wait_ids_struct {
       public:
+        wait_ids_struct() = delete;
+        wait_ids_struct( const wait_ids_struct& ) = delete;
+        wait_ids_struct& operator=( const wait_ids_struct & ) = delete;
         wait_ids_struct( size_t N, const ThreadPool::thread_id_t *ids, size_t N_wait,
             AtomicOperations::pool<condition_variable,128>& cv_pool, int N_wait_list, volatile wait_ids_struct **list );
         ~wait_ids_struct( );
         void id_finished( const ThreadPool::thread_id_t& id ) const;
-        bool wait_for( double seconds );
+        bool wait_for( double total_time, double recheck_time );
       private:
         mutable int d_wait;                     // The number of work items that must finish before we alert the thread
         mutable int d_N;                        // The number of ids we are waiting on
@@ -657,9 +630,8 @@ private:
         condition_variable *d_wait_event;       // Handle to a wait event
         volatile mutable bool *d_finished;      // Has each id finished
         volatile mutable wait_ids_struct **d_ptr;
-        wait_ids_struct();
-        wait_ids_struct( const wait_ids_struct& );
-        wait_ids_struct& operator=( const wait_ids_struct & );
+      private:
+        inline bool check();
     };
 
 
@@ -670,10 +642,8 @@ private:
     ThreadPool( const ThreadPool & );
     ThreadPool &operator=( const ThreadPool & );
 
-    // Function to initialize the thread pool
-    void setNumThreads( int N, const char *affinity, int N_procs, const int *procs );
-    void initialize( int N, const char *affinity, int N_procs, const int *procs );
-    void check_startup( size_t size0 );
+    // Function to check the startup
+    void check_startup( );
 
     // Function to add an array of work items
     void add_work(
@@ -701,39 +671,45 @@ private:
     inline bool isMemberThread() const { return getThreadNumber()>=0; }
 
     // Function to wait for some work items to finish
-    int wait_some( size_t N_work, const thread_id_t *ids, size_t N_wait, bool *finished ) const;
+    int wait_some( size_t N_work, const thread_id_t *ids, size_t N_wait, bool *finished, int max_wait ) const;
     
     // Check if we are waiting too long and pring debug info
-    void check_wait_time( std::chrono::time_point<std::chrono::high_resolution_clock>& t1 ) const;
+    void print_wait_warning( ) const;
+
 
 private:
     ///// Member data
-    typedef AtomicOperations::int64_atomic atomic_64;
-    typedef AtomicList<thread_id_t,MAX_QUEUED,std::greater<thread_id_t>> queue_type;
-    // Note: We want to store the variables in a certain order to optimize storage
-    //   and ensure consistent packing / object size
-    size_t d_NULL_HEAD;                     // Null data buffer to check memory bounds
-    volatile atomic_64 d_id_assign;         // An internal variable used to store the current id to assign
-    volatile mutable bool d_signal_empty;   // Do we want to send a signal when the queue is empty
-    volatile mutable int32_atomic d_signal_count; // Signal count
-    short int d_N_threads;                  // Number of threads
-    volatile int32_atomic d_num_active;     // Number of threads that are currently active
-    volatile atomic_64 d_active[MAX_NUM_THREADS/64]; // Which threads are currently active
-    volatile atomic_64 d_cancel[MAX_NUM_THREADS/64]; // Which threads should be deleted
-    volatile atomic_64 d_N_added;           // Number of items added to the work queue
-    volatile atomic_64 d_N_started;         // Number of items started
-    volatile atomic_64 d_N_finished;        // Number of items finished
-    volatile mutable wait_ids_struct *d_wait[MAX_WAIT]; // The wait events to check
-    mutable wait_ids_struct *d_wait_last;   // A cached copy of the last completed wait event (in case a thread still has a reference)
-    condition_variable d_wait_finished;     // Condition variable to signal when all work is finished
-    condition_variable d_wait_work;         // Condition variable to signal when there is new work
-    mutable AtomicOperations::pool<condition_variable,128> d_cond_pool;
-    std::thread d_thread[MAX_NUM_THREADS];  // Handles to the threads
-    std::thread::id d_threadId[MAX_NUM_THREADS]; // Unique id for each thread
-    queue_type d_queue_list;                // The work queue
-    size_t d_NULL_TAIL;                     // Null data buffer to check memory bounds
-    int d_max_wait_time;                    // The maximum time in a wait command before printing a warning message
-    std::function<void(const std::string&)> d_errorHandler;
+
+    // Typedefs
+    typedef volatile AtomicOperations::int32_atomic vint32_t;
+    typedef volatile AtomicOperations::int64_atomic vint64_t;
+    typedef volatile wait_ids_struct vwait_t;
+    typedef AtomicOperations::pool<condition_variable,128> cond_t;
+    typedef AtomicList<thread_id_t,std::greater<thread_id_t>> queue_type;
+
+    // Internal data
+    uint32_t d_NULL_HEAD;                 // Null data buffer to check memory bounds
+    volatile mutable bool d_signal_empty; // Do we want to send a signal when the queue is empty
+    uint16_t d_N_threads;                 // Number of threads
+    int d_max_wait_time;                  // The maximum time in a wait command before printing a warning message
+    vint32_t d_signal_count;              // Signal count
+    vint32_t d_num_active;                // Number of threads that are currently active
+    vint64_t d_id_assign;                 // An internal variable used to store the current id to assign
+    vint64_t d_active[MAX_THREADS/64];    // Which threads are currently active
+    vint64_t d_cancel[MAX_THREADS/64];    // Which threads should be deleted
+    vint64_t d_N_added;                   // Number of items added to the work queue
+    vint64_t d_N_started;                 // Number of items started
+    vint64_t d_N_finished;                // Number of items finished
+    mutable vwait_t *d_wait[MAX_WAIT];    // The wait events to check
+    mutable wait_ids_struct *d_wait_last; // A cached copy of the last completed wait event (in case a thread still has a reference)
+    condition_variable d_wait_finished;   // Condition variable to signal when all work is finished
+    condition_variable d_wait_work;       // Condition variable to signal when there is new work
+    mutable cond_t d_cond_pool;           // Condition pool
+    std::thread d_thread[MAX_THREADS];    // Handles to the threads
+    std::thread::id d_threadId[MAX_THREADS]; // Unique id for each thread
+    queue_type d_queue_list;              // The work queue
+    std::function<void(const std::string&)> d_errorHandler; // Error handler
+    uint32_t d_NULL_TAIL;                 // Null data buffer to check memory bounds
 };
 
 
diff --git a/threadpool/thread_pool.hpp b/threadpool/thread_pool.hpp
index a87860b3..394e5619 100644
--- a/threadpool/thread_pool.hpp
+++ b/threadpool/thread_pool.hpp
@@ -21,19 +21,10 @@
  *  \param args         The arguments to pass to the function in the form (arg1,arg2,...)
  *  \param priority     Optional argument specifying the priority of the work item
  */
-#define TPOOL_TUPLE_TO_SEQ( t ) TPOOL_TUPLE_TO_SEQ_##II t
-#define TPOOL_TUPLE_TO_SEQ_II( a, ... ) a, ##__VA_ARGS__
-#if defined( WIN32 ) || defined( _WIN32 ) || defined( WIN64 ) || defined( _WIN64 )
-#define TPOOL_GET_PRIORITY( a, N, c, ... ) N
-#define TPOOL_ADD_WORK( TPOOL, FUNCTION, ARGS, ... )                                      \
-    ThreadPool_add_work( TPOOL, TPOOL_GET_PRIORITY( 0, __VA_ARGS__, 0, 0 ) + 0, FUNCTION, \
-        TPOOL_TUPLE_TO_SEQ( ARGS ) )
-#else
-#define TPOOL_GET_PRIORITY( _0, N, ... ) N
-#define TPOOL_ADD_WORK( TPOOL, FUNCTION, ARGS, ... ) \
-    ThreadPool_add_work(                             \
-        TPOOL, TPOOL_GET_PRIORITY( _0, ##__VA_ARGS__, 0 ), FUNCTION, TPOOL_TUPLE_TO_SEQ( ARGS ) )
-#endif
+#define TPOOL_ADD_WORK2( TPOOL, FUNCTION, ARGS, PRIORITY, ... ) \
+    ThreadPool_add_work( TPOOL, PRIORITY, FUNCTION, std::make_tuple ARGS )
+#define TPOOL_ADD_WORK( TPOOL, FUNCTION, ... ) TPOOL_ADD_WORK2( TPOOL, FUNCTION, __VA_ARGS__, 0, 0 )
+
 
 /*! @} */
 
@@ -59,17 +50,17 @@ struct make_indexes : make_indexes_impl<0, index_tuple<>, Types...> {
 };
 template<class Ret, class... Args, int... Indexes>
 inline Ret apply_helper(
-    Ret ( *pf )( Args... ), index_tuple<Indexes...>, std::tuple<Args...> &&tup )
+    std::function<Ret( Args... )> &pf, index_tuple<Indexes...>, std::tuple<Args...> &&tup )
 {
     return pf( std::forward<Args>( std::get<Indexes>( tup ) )... );
 }
 template<class Ret, class... Args>
-inline Ret apply( Ret ( *pf )( Args... ), const std::tuple<Args...> &tup )
+inline Ret apply( std::function<Ret( Args... )> &pf, const std::tuple<Args...> &tup )
 {
     return apply_helper( pf, typename make_indexes<Args...>::type(), std::tuple<Args...>( tup ) );
 }
 template<class Ret, class... Args>
-inline Ret apply( Ret ( *pf )( Args... ), std::tuple<Args...> &&tup )
+inline Ret apply( std::function<Ret( Args... )> &pf, std::tuple<Args...> &&tup )
 {
     return apply_helper(
         pf, typename make_indexes<Args...>::type(), std::forward<std::tuple<Args...>>( tup ) );
@@ -92,32 +83,40 @@ public:
 template<class Ret, class... Args>
 class WorkItemFull;
 template<class... Args>
-class WorkItemFull<void, Args...> : public ThreadPool::WorkItemRet<void>
+class WorkItemFull<void, Args...> final : public ThreadPool::WorkItemRet<void> // work item for routines returning void
 {
 private:
-    void ( *routine )( Args... );
+    std::function<void( Args... )> routine; // callable invoked by run()
     std::tuple<Args...> args;
     WorkItemFull();
 
 public:
-    WorkItemFull( void ( *routine2 )( Args... ), Args... ts )
-        : ThreadPool::WorkItemRet<void>(), routine( routine2 ), args( ts... )
+    WorkItemFull( std::function<void( Args... )> &&routine2, Args... ts )
+        : ThreadPool::WorkItemRet<void>(), routine( std::move( routine2 ) ), args( ts... )
+    { // individually passed arguments are copied into the stored tuple
+    }
+    WorkItemFull( std::function<void( Args... )> &&routine2, std::tuple<Args...> &&ts )
+        : ThreadPool::WorkItemRet<void>(), routine( std::move( routine2 ) ), args( std::move( ts ) ) // ts is a named rvalue reference: move it rather than copy
     {
     }
     virtual void run() override { apply( routine, args ); }
     virtual ~WorkItemFull() {}
 };
 template<class Ret, class... Args>
-class WorkItemFull : public ThreadPool::WorkItemRet<Ret>
+class WorkItemFull final : public ThreadPool::WorkItemRet<Ret>
 {
 private:
-    Ret ( *routine )( Args... );
+    std::function<Ret( Args... )> routine;
     std::tuple<Args...> args;
     WorkItemFull();
 
 public:
-    WorkItemFull( Ret ( *routine2 )( Args... ), Args... ts )
-        : ThreadPool::WorkItemRet<Ret>(), routine( routine2 ), args( ts... )
+    WorkItemFull( std::function<Ret( Args... )> &&routine2, Args... ts )
+        : ThreadPool::WorkItemRet<Ret>(), routine( std::move( routine2 ) ), args( ts... )
+    {
+    }
+    WorkItemFull( std::function<Ret( Args... )> &&routine2, std::tuple<Args...> &&ts )
+        : ThreadPool::WorkItemRet<Ret>(), routine( std::move( routine2 ) ), args( std::move( ts ) ) // ts is a named rvalue reference: move it rather than copy
     {
     }
     virtual void run() override { this->d_result = apply( routine, args ); }
@@ -126,11 +125,40 @@ public:
 
 
 // Functions to add work to the thread pool
-template<class Ret, class... Ts>
+// clang-format off
+template<class Ret, class... Args> // std::function overload taking the arguments pre-packed in a tuple
 inline ThreadPool::thread_id_t ThreadPool_add_work(
-    ThreadPool *tpool, int priority, Ret ( *routine )( Ts... ), Ts... ts )
+    ThreadPool *tpool, int priority, std::function<Ret( Args... )> routine, std::tuple<Args...> &&args )
 {
-    auto work = new WorkItemFull<Ret, Ts...>( routine, ts... );
+    auto work = new WorkItemFull<Ret, Args...>( std::move( routine ), std::move( args ) ); // routine is an lvalue here; it must be moved to bind to the constructor's && parameter
+    return ThreadPool::add_work( tpool, work, priority );
+}
+template<class Ret, class... Args>
+inline ThreadPool::thread_id_t ThreadPool_add_work(
+    ThreadPool *tpool, int priority, Ret ( *routine )( Args... ), std::tuple<Args...> &&args )
+{
+    // Wrap the function pointer and its packed arguments in a work item, then hand it to the pool
+    return ThreadPool::add_work( tpool, new WorkItemFull<Ret, Args...>( routine, std::move( args ) ), priority );
+}
+template<class Ret, class... Args>
+inline ThreadPool::thread_id_t ThreadPool_add_work(
+    ThreadPool *tpool, int priority, Ret ( *routine )(), std::tuple<std::nullptr_t>&& )
+{
+    // The tuple parameter is unnamed and ignored; the work item is built from the routine alone
+    return ThreadPool::add_work( tpool, new WorkItemFull<Ret>( routine ), priority );
+}
+template<class Ret, class... Args> // std::function overload taking the arguments individually
+inline ThreadPool::thread_id_t ThreadPool_add_work(
+    ThreadPool *tpool, int priority, std::function<Ret( Args... )> routine, Args... args )
+{ // routine is an lvalue here; it must be moved to bind to the constructor's && parameter
+    auto work = new WorkItemFull<Ret, Args...>( std::move( routine ), std::forward_as_tuple( args... ) );
+    return ThreadPool::add_work( tpool, work, priority );
+}
+template<class Ret, class... Args> // function-pointer overload taking the arguments individually
+inline ThreadPool::thread_id_t ThreadPool_add_work(
+    ThreadPool *tpool, int priority, Ret ( *routine )( Args... ), Args... args )
+{ // builds a WorkItemFull from the routine and arguments, then queues it with the given priority
+    auto work = new WorkItemFull<Ret, Args...>( routine, std::forward_as_tuple( args... ) ); // the reference tuple is converted to std::tuple<Args...> for the constructor
     return ThreadPool::add_work( tpool, work, priority );
 }
 template<class Ret>
@@ -141,10 +169,29 @@ inline ThreadPool::thread_id_t ThreadPool_add_work(
     return ThreadPool::add_work( tpool, work, priority );
 }
 template<class Ret, class... Args>
+inline ThreadPool::WorkItem *ThreadPool::createWork( // std::function overload, arguments passed individually
+    std::function<Ret( Args... )> routine, Args... args )
+{ // routine is an lvalue here; it must be moved to bind to the constructor's && parameter
+    return new WorkItemFull<Ret, Args...>( std::move( routine ), std::forward_as_tuple( args... ) );
+}
+template<class Ret, class... Args>
 inline ThreadPool::WorkItem *ThreadPool::createWork( Ret ( *routine )( Args... ), Args... args )
 {
-    return new WorkItemFull<Ret, Args...>( routine, args... );
+    return new WorkItemFull<Ret, Args...>( routine, std::forward_as_tuple( args... ) );
 }
+template<class Ret, class... Args> // std::function overload taking the arguments pre-packed in a tuple
+inline ThreadPool::WorkItem *ThreadPool::createWork(
+    std::function<Ret( Args... )> routine, std::tuple<Args...> &&args )
+{ // routine is an lvalue here; it must be moved to bind to the constructor's && parameter
+    return new WorkItemFull<Ret, Args...>( std::move( routine ), std::move( args ) );
+}
+template<class Ret, class... Args>
+inline ThreadPool::WorkItem *ThreadPool::createWork( Ret ( *routine )( Args... ), std::tuple<Args...> &&args )
+{
+    auto *item = new WorkItemFull<Ret, Args...>( routine, std::move( args ) ); // function pointer plus packed arguments
+    return item;
+}
+// clang-format on
 
 
 /******************************************************************
@@ -174,71 +221,49 @@ inline Ret ThreadPool::getFunctionRet( const ThreadPool::thread_id_t &id )
 /******************************************************************
  * Inline functions to wait for the work items to finish           *
  ******************************************************************/
-inline int ThreadPool::wait( ThreadPool::thread_id_t id ) const
+inline void ThreadPool::wait( ThreadPool::thread_id_t id ) const
 {
     bool finished;
-    wait_some( 1, &id, 1, &finished );
-    return 0;
+    // Exactly one id is requested, so any other completion count is reported as an error
+    const bool ok = wait_some( 1, &id, 1, &finished, 10000000 ) == 1;
+    if ( !ok ) throw std::logic_error( "Failed to wait for id" );
 }
-inline int ThreadPool::wait_any( size_t N_work, const ThreadPool::thread_id_t *ids )
-{
-    auto finished = new bool[N_work];
-    wait_some( N_work, ids, 1, finished );
-    int index = -1;
-    for ( size_t i = 0; i < N_work; i++ ) {
-        if ( finished[i] ) {
-            index = static_cast<int>( i );
-            break;
-        }
-    }
-    delete[] finished;
-    return index;
-}
-inline int ThreadPool::wait_any( const std::vector<thread_id_t> &ids ) const
+inline size_t ThreadPool::wait_any( const std::vector<thread_id_t> &ids ) const
 {
     if ( ids.empty() )
         return 0;
     auto finished = new bool[ids.size()];
-    wait_some( ids.size(), &ids[0], 1, finished );
-    int index = -1;
+    int N         = wait_some( ids.size(), &ids[0], 1, finished, 10000000 );
+    // Release the scratch buffer before throwing so the failure path does not leak it
+    if ( N < 1 ) { delete[] finished; throw std::logic_error( "Failed to wait for any id" ); }
     for ( size_t i = 0; i < ids.size(); i++ ) {
         if ( finished[i] ) {
-            index = static_cast<int>( i );
-            break;
+            delete[] finished;
+            return i;
         }
     }
-    delete[] finished;
-    return index;
+    delete[] finished; throw std::logic_error( "wait_any failed" ); // no id was flagged as finished; free the buffer first
 }
-inline int ThreadPool::wait_all( size_t N_work, const ThreadPool::thread_id_t *ids ) const
-{
-    if ( N_work == 0 )
-        return 0;
-    auto finished = new bool[N_work];
-    wait_some( N_work, ids, N_work, finished );
-    delete[] finished;
-    return 0;
-}
-inline int ThreadPool::wait_all( const std::vector<thread_id_t> &ids ) const
+inline void ThreadPool::wait_all( const std::vector<thread_id_t> &ids ) const
 {
     if ( ids.empty() )
-        return 0;
+        return;
     auto finished = new bool[ids.size()];
-    wait_some( ids.size(), ids.data(), ids.size(), finished );
+    int N         = wait_some( ids.size(), ids.data(), ids.size(), finished, 10000000 );
+    // Release the scratch buffer before throwing so the failure path does not leak it
+    if ( N != (int) ids.size() ) { delete[] finished; throw std::logic_error( "Failed to wait for all ids" ); }
     delete[] finished;
-    return 0;
 }
-inline int ThreadPool::wait_all( const ThreadPool *tpool, const std::vector<thread_id_t> &ids )
+inline void ThreadPool::wait_all( const ThreadPool *tpool, const std::vector<thread_id_t> &ids )
 {
     if ( tpool )
         return tpool->wait_all( ids );
-    return ids.size();
 }
 inline std::vector<int> ThreadPool::wait_some(
-    int N_wait, const std::vector<thread_id_t> &ids ) const
+    int N_wait, const std::vector<thread_id_t> &ids, int max_wait ) const
 {
     auto finished  = new bool[ids.size()];
-    int N_finished = wait_some( ids.size(), ids.data(), N_wait, finished );
+    int N_finished = wait_some( ids.size(), ids.data(), N_wait, finished, max_wait );
     std::vector<int> index( N_finished, -1 );
     for ( size_t i = 0, j = 0; i < ids.size(); i++ ) {
         if ( finished[i] ) {
@@ -313,7 +338,7 @@ inline std::vector<ThreadPool::thread_id_t> ThreadPool::add_work( ThreadPool *tp
  * Class functions to for the thread id                            *
  ******************************************************************/
 inline ThreadPool::thread_id_t::thread_id_t()
-    : d_id( nullThreadID ), d_count( NULL ), d_work( NULL )
+    : d_id( nullThreadID ), d_count( nullptr ), d_work( nullptr )
 {
 }
 inline ThreadPool::thread_id_t::~thread_id_t() { reset(); }
@@ -350,7 +375,7 @@ inline ThreadPool::thread_id_t &ThreadPool::thread_id_t::operator=(
 inline ThreadPool::thread_id_t::thread_id_t( const volatile ThreadPool::thread_id_t &rhs )
     : d_id( rhs.d_id ), d_count( rhs.d_count ), d_work( rhs.d_work )
 {
-    if ( d_count != NULL )
+    if ( d_count != nullptr )
         AtomicOperations::atomic_increment( d_count );
 }
 #if !defined( WIN32 ) && !defined( _WIN32 ) && !defined( WIN64 ) && !defined( _WIN64 )
@@ -435,15 +460,9 @@ inline void ThreadPool::thread_id_t::reset()
 }
 inline uint64_t ThreadPool::thread_id_t::createId( int priority, uint64_t local_id )
 {
-    if ( priority < -127 || priority > 127 )
-        throw std::logic_error( "priority limited to +- 127" );
-    if ( local_id > maxThreadID )
-        throw std::logic_error( "local id >= 2^56" );
-    char tmp1          = static_cast<char>( priority + 128 );
-    unsigned char tmp2 = static_cast<unsigned char>( tmp1 );
-    if ( priority >= 0 )
-        tmp2 |= 0x80;
-    uint64_t id = tmp2;
+    if ( priority < -127 || priority > 127 || local_id > maxThreadID ) // both range checks share one error
+        throw std::logic_error( "Invalid priority or local id" );
+    uint64_t id = priority + 128; // bias the signed priority into 1..255 before it is shifted into the top byte
     id          = ( id << 56 ) + local_id;
     return id;
 }
@@ -460,9 +479,8 @@ inline void ThreadPool::thread_id_t::reset( int priority, uint64_t local_id, voi
     d_id = createId( priority, local_id );
     // Create the work and counter
     d_count = nullptr;
-    d_work  = nullptr;
-    if ( work != nullptr ) {
-        d_work   = work;
+    d_work  = work;
+    if ( d_work != nullptr ) {
         d_count  = &( reinterpret_cast<WorkItem *>( work )->d_count );
         *d_count = 1;
     }
@@ -512,7 +530,6 @@ inline bool ThreadPool::thread_id_t::ready() const
  ******************************************************************/
 inline bool ThreadPool::isValid( const ThreadPool::thread_id_t &id ) const
 {
-    static_assert( sizeof( atomic_64 ) == 8, "atomic_64 must be a 64-bit integer" );
     uint64_t local_id = id.getLocalID();
     uint64_t next_id  = d_id_assign - 1;
     return local_id != 0 && id.initialized() && local_id <= thread_id_t::maxThreadID &&