Updating threadpool, Array, StackTrace, ... classes

2018-02-06 10:50:43 -05:00
parent 396bb07b26
commit 98d86d2f94
21 changed files with 3839 additions and 2444 deletions
--- a/common/Array.h
+++ b/common/Array.h
@@ -1,34 +1,15 @@
 #ifndef included_ArrayClass
 #define included_ArrayClass

-#include <vector>
 #include <array>
+#include <cstring>
 #include <functional>
+#include <initializer_list>
 #include <iostream>
-#include <stdexcept>
 #include <memory>
-#include <iostream>
+#include <vector>

-
-#define ARRAY_NDIM_MAX 5 // Maximum number of dimensions supported
-
-
-#define GET_ARRAY_INDEX3D( N, i1, i2, i3 ) i1 + N[0] * ( i2 + N[1] * i3 )
-#define GET_ARRAY_INDEX4D( N, i1, i2, i3, i4 ) i1 + N[0] * ( i2 + N[1] * ( i3 + N[2] * i4 ) )
-#define GET_ARRAY_INDEX5D( N, i1, i2, i3, i4, i5 ) i1 + N[0] * ( i2 + N[1] * ( i3 + N[2] * ( i4 + N[3] * i5 ) ) )
-
-#if defined( DEBUG ) || defined( _DEBUG )
-    #define CHECK_ARRAY_INDEX3D( N, i1, i2, i3 )                  \
-        if ( GET_ARRAY_INDEX3D( N, i1, i2, i3 ) < 0 || GET_ARRAY_INDEX3D( N, i1, i2, i3 ) >= d_length ) \
-            throw std::logic_error( "Index exceeds array bounds" );
-    #define CHECK_ARRAY_INDEX4D( N, i1, i2, i3, i4 )              \
-        if ( GET_ARRAY_INDEX4D( N, i1, i2, i3, i4 ) < 0 ||        \
-             GET_ARRAY_INDEX4D( N, i1, i2, i3, i4 ) >= d_length ) \
-            throw std::logic_error( "Index exceeds array bounds" );
-#else
-    #define CHECK_ARRAY_INDEX3D( N, i1, i2, i3 )
-    #define CHECK_ARRAY_INDEX4D( N, i1, i2, i3, i4 )
-#endif
+#include "Utilities.h"


 #if defined( __CUDA_ARCH__ )
@@ -37,20 +18,244 @@
 #else
 #define HOST_DEVICE
 #endif
+#if defined( USING_GCC ) || defined( USING_CLANG )
+#define ATTRIBUTE_INLINE __attribute__( ( always_inline ) )
+#else
+#define ATTRIBUTE_INLINE
+#endif
+
+
+#if ( defined( DEBUG ) || defined( _DEBUG ) ) && !defined( NDEBUG )
+#define CHECK_ARRAY_LENGTH( i )                                      \
+    do {                                                             \
+        if ( i >= d_length )                                         \
+            throw std::length_error( "Index exceeds array bounds" ); \
+    } while ( 0 )
+#else
+#define CHECK_ARRAY_LENGTH( i ) \
+    do {                        \
+    } while ( 0 )
+#endif
+
+
+// Forward declaration
+class FunctionTable;
+
+
+//! Simple range class
+template<class TYPE = size_t>
+class Range final
+{
+public:
+    //! Empty constructor
+    Range() : i( 0 ), j( -1 ), k( 1 ) {}
+
+    /*!
+     * Create a range i:k:j (or i:j)
+     * @param i_            Starting value
+     * @param j_            Ending value
+     * @param k_            Increment value
+     */
+    Range( TYPE i_, TYPE j_, TYPE k_ = 1 ) : i( i_ ), j( j_ ), k( k_ ) {}
+
+    TYPE i, j, k;
+};
+
+
+//! Simple class to store the array dimensions
+class ArraySize final
+{
+public:
+    //! Empty constructor
+    inline ArraySize();
+
+    /*!
+     * Create the vector size
+     * @param N1            Number of elements in the first dimension
+     */
+    inline ArraySize( size_t N1 );
+
+    /*!
+     * Create the vector size
+     * @param N1            Number of elements in the first dimension
+     * @param N2            Number of elements in the second dimension
+     */
+    inline ArraySize( size_t N1, size_t N2 );
+
+    /*!
+     * Create the vector size
+     * @param N1            Number of elements in the first dimension
+     * @param N2            Number of elements in the second dimension
+     * @param N3            Number of elements in the third dimension
+     */
+    inline ArraySize( size_t N1, size_t N2, size_t N3 );
+
+    /*!
+     * Create the vector size
+     * @param N1            Number of elements in the first dimension
+     * @param N2            Number of elements in the second dimension
+     * @param N3            Number of elements in the third dimension
+     * @param N4            Number of elements in the fourth dimension
+     */
+    inline ArraySize( size_t N1, size_t N2, size_t N3, size_t N4 );
+
+    /*!
+     * Create the vector size
+     * @param N1            Number of elements in the first dimension
+     * @param N2            Number of elements in the second dimension
+     * @param N3            Number of elements in the third dimension
+     * @param N4            Number of elements in the fourth dimension
+     * @param N5            Number of elements in the fifth dimension
+     */
+    inline ArraySize( size_t N1, size_t N2, size_t N3, size_t N4, size_t N5 );
+
+    /*!
+     * Create from initializer list
+     * @param N             Size of the array
+     */
+    inline ArraySize( std::initializer_list<size_t> N );
+
+    /*!
+     * Create from raw pointer
+     * @param ndim          Number of dimensions
+     * @param dims          Number of elements in each dimension
+     */
+    inline ArraySize( size_t ndim, const size_t *dims );
+
+    /*!
+     * Create from std::vector
+     * @param N             Size of the array
+     */
+    inline ArraySize( const std::vector<size_t> &N );
+
+    /*!
+     * Copy constructor
+     * @param rhs           Array to copy
+     */
+    inline ArraySize( const ArraySize &rhs );
+
+    /*!
+     * Move constructor
+     * @param rhs           ArraySize to move from
+     */
+    inline ArraySize( ArraySize &&rhs );
+
+    /*!
+     * Assignment operator
+     * @param rhs           Array to copy
+     */
+    inline ArraySize &operator=( const ArraySize &rhs );
+
+    /*!
+     * Move assignment operator
+     * @param rhs           ArraySize to move from
+     */
+    inline ArraySize &operator=( ArraySize &&rhs );
+
+    /*!
+     * Access the ith dimension
+     * @param i             Index to access
+     */
+    inline size_t operator[]( size_t i ) const { return d_N[i]; }
+
+    //! Return the number of dimensions
+    inline uint8_t ndim() const ATTRIBUTE_INLINE { return d_ndim; }
+
+    //! Return the number of dimensions (as a size_t)
+    inline size_t size() const ATTRIBUTE_INLINE { return d_ndim; }
+
+    //! Return the total length of the array
+    inline size_t length() const ATTRIBUTE_INLINE { return d_length; }
+
+    //! Resize the given dimension
+    inline void resize( uint8_t dim, size_t N );
+
+    //! Returns an iterator to the beginning
+    inline const size_t *begin() const ATTRIBUTE_INLINE { return d_N; }
+
+    //! Returns an iterator to the end
+    inline const size_t *end() const ATTRIBUTE_INLINE { return d_N + d_ndim; }
+
+    //! Check if two array sizes are equal
+    inline bool operator==( const ArraySize &rhs ) const ATTRIBUTE_INLINE
+    {
+        return d_ndim == rhs.d_ndim && memcmp( d_N, rhs.d_N, sizeof( d_N ) ) == 0;
+    }
+
+    //! Check if two array sizes are not equal
+    inline bool operator!=( const ArraySize &rhs ) const ATTRIBUTE_INLINE
+    {
+        return d_ndim != rhs.d_ndim || memcmp( d_N, rhs.d_N, sizeof( d_N ) ) != 0;
+    }
+
+    //! Maximum supported dimension
+    constexpr static uint8_t maxDim() ATTRIBUTE_INLINE { return 5u; }
+
+    //! Get the index
+    inline size_t index( size_t i ) const ATTRIBUTE_INLINE
+    {
+        CHECK_ARRAY_LENGTH( i );
+        return i;
+    }
+
+    //! Get the index
+    inline size_t index( size_t i1, size_t i2 ) const ATTRIBUTE_INLINE
+    {
+        size_t index = i1 + i2 * d_N[0];
+        CHECK_ARRAY_LENGTH( index );
+        return index;
+    }
+
+    //! Get the index
+    inline size_t index( size_t i1, size_t i2, size_t i3 ) const ATTRIBUTE_INLINE
+    {
+        size_t index = i1 + d_N[0] * ( i2 + d_N[1] * i3 );
+        CHECK_ARRAY_LENGTH( index );
+        return index;
+    }
+
+    //! Get the index
+    inline size_t index( size_t i1, size_t i2, size_t i3, size_t i4 ) const ATTRIBUTE_INLINE
+    {
+        size_t index = i1 + d_N[0] * ( i2 + d_N[1] * ( i3 + d_N[2] * i4 ) );
+        CHECK_ARRAY_LENGTH( index );
+        return index;
+    }
+
+    //! Get the index
+    inline size_t index(
+        size_t i1, size_t i2, size_t i3, size_t i4, size_t i5 ) const ATTRIBUTE_INLINE
+    {
+        size_t index = i1 + d_N[0] * ( i2 + d_N[1] * ( i3 + d_N[2] * ( i4 + d_N[3] * i5 ) ) );
+        CHECK_ARRAY_LENGTH( index );
+        return index;
+    }
+
+private:
+    uint8_t d_ndim;
+    size_t d_length;
+    size_t d_N[5];
+};


 /*!
 * Class Array is a multi-dimensional array class written by Mark Berrill
 */
-template <class TYPE>
-class Array
+template<class TYPE, class FUN = FunctionTable>
+class Array final
 {
-public:
+public: // Constructors / assignment operators
    /*!
     * Create a new empty Array
     */
    Array();

+    /*!
+     * Create an Array with the given size
+     * @param N             Size of the array
+     */
+    explicit Array( const ArraySize &N );
+
    /*!
     * Create a new 1D Array with the given number of elements
     * @param N             Number of elements in the array
@@ -72,6 +277,25 @@ public:
     */
    explicit Array( size_t N1, size_t N2, size_t N3 );

+    /*!
+     * Create a new 4D Array with the given number of rows and columns
+     * @param N1            Number of elements in the first dimension
+     * @param N2            Number of elements in the second dimension
+     * @param N3            Number of elements in the third dimension
+     * @param N4            Number of elements in the fourth dimension
+     */
+    explicit Array( size_t N1, size_t N2, size_t N3, size_t N4 );
+
+    /*!
+     * Create a new 5D Array with the given number of elements in each dimension
+     * @param N1            Number of elements in the first dimension
+     * @param N2            Number of elements in the second dimension
+     * @param N3            Number of elements in the third dimension
+     * @param N4            Number of elements in the fourth dimension
+     * @param N5            Number of elements in the fifth dimension
+     */
+    explicit Array( size_t N1, size_t N2, size_t N3, size_t N4, size_t N5 );
+
    /*!
     * Create a multi-dimensional Array with the given number of elements
     * @param N             Number of elements in each dimension
@@ -79,6 +303,19 @@ public:
     */
    explicit Array( const std::vector<size_t> &N, const TYPE *data = NULL );

+    /*!
+     * Create a 1D Array with the range
+     * @param range         Range of the data
+     */
+    explicit Array( const Range<TYPE> &range );
+
+    /*!
+     * Create a 1D Array with the given initializer list
+     * @param data          Input data
+     */
+    Array( std::initializer_list<TYPE> data );
+
+
    /*!
     * Copy constructor
     * @param rhs           Array to copy
@@ -109,7 +346,7 @@ public:
     */
    Array &operator=( const std::vector<TYPE> &rhs );

-
+public: // Views/copies/subset
    /*!
     * Create a 1D Array view to a raw block of data
     * @param N             Number of elements in the array
@@ -141,8 +378,7 @@ public:
     * @param N             Number of elements in each dimension
     * @param data          Pointer to the data
     */
-    static std::shared_ptr<Array> view(
-        const std::vector<size_t> &N, std::shared_ptr<TYPE> const &data );
+    static std::shared_ptr<Array> view( const ArraySize &N, std::shared_ptr<TYPE> const &data );


    /*!
@@ -178,7 +414,7 @@ public:
     * @param data          Pointer to the data
     */
    static std::shared_ptr<const Array> constView(
-        const std::vector<size_t> &N, std::shared_ptr<const TYPE> const &data );
+        const ArraySize &N, std::shared_ptr<const TYPE> const &data );


    /*!
@@ -192,7 +428,20 @@ public:
     * @param N             Number of elements in each dimension
     * @param data          Pointer to the data
     */
-    void view2( const std::vector<size_t> &N, std::shared_ptr<TYPE> const &data );
+    void view2( const ArraySize &N, std::shared_ptr<TYPE> const &data );
+
+    /*!
+     * Make this object a view of the raw data (expert use only).
+     * Use view2( N, std::shared_ptr(data,[](TYPE*){}) ) instead.
+     *   Note: this interface is not recommended as it does not protect from
+     *   the src data being deleted while still being used by the Array.
+     *   Additionally for maximum performance it does not set the internal shared_ptr
+     *   so functions like getPtr and resize will not work correctly.
+     * @param ndim          Number of dimensions
+     * @param dims          Number of elements in each dimension
+     * @param data          Pointer to the data
+     */
+    void viewRaw( int ndim, const size_t *dims, TYPE *data );

    /*!
     * Make this object a view of the raw data (expert use only).
@@ -204,26 +453,14 @@ public:
     * @param N             Number of elements in each dimension
     * @param data          Pointer to the data
     */
-    void viewRaw( const std::initializer_list<size_t> &N, TYPE *data );
-
-    /*!
-     * Make this object a view of the raw data (expert use only).
-     * Use view2( N, std::shared_ptr(data,[](TYPE*){}) ) instead.
-     *   Note: this interface is not recommended as it does not protect from
-     *   the src data being deleted while still being used by the Array.
-     *   Additionally for maximum performance it does not set the internal shared_ptr
-     *   so functions like getPtr and resize will not work correctly.
-     * @param N             Number of elements in each dimension
-     * @param data          Pointer to the data
-     */
-    void viewRaw( const std::vector<size_t> &N, TYPE *data );
+    void viewRaw( const ArraySize &N, TYPE *data );

    /*!
     * Convert an array of one type to another.  This may or may not allocate new memory.
     * @param array         Input array
     */
    template<class TYPE2>
-    static std::shared_ptr<Array<TYPE2>> convert( std::shared_ptr<Array<TYPE>> array );
+    static std::shared_ptr<Array<TYPE2>> convert( std::shared_ptr<Array<TYPE, FUN>> array );


    /*!
@@ -231,7 +468,8 @@ public:
     * @param array         Input array
     */
    template<class TYPE2>
-    static std::shared_ptr<const Array<TYPE2>> convert( std::shared_ptr<const Array<TYPE>> array );
+    static std::shared_ptr<const Array<TYPE2>> convert(
+        std::shared_ptr<const Array<TYPE, FUN>> array );


    /*!
@@ -256,6 +494,13 @@ public:
    template<class TYPE2>
    void copyTo( TYPE2 *array ) const;

+    /*!
+     * Copy and convert data from this array to a new array of type TYPE2.
+     * @return              New array holding the converted data
+     */
+    template<class TYPE2>
+    Array<TYPE2, FUN> cloneTo() const;
+

    /*!
     * Fill the array with the given value
@@ -274,7 +519,7 @@ public:
     * @param base        Base array
     * @param exp         Exponent value
     */
-    void pow( const Array<TYPE> &baseArray, const TYPE &exp );
+    void pow( const Array<TYPE, FUN> &base, const TYPE &exp );

    //! Destructor
    ~Array();
@@ -285,23 +530,27 @@ public:


    //! Return the size of the Array
-    inline int ndim() const { return d_ndim; }
+    inline int ndim() const { return d_size.ndim(); }


    //! Return the size of the Array
-    inline std::vector<size_t> size() const { return std::vector<size_t>( d_N, d_N + d_ndim ); }
+    inline ArraySize &size() { return d_size; }


    //! Return the size of the Array
-    inline size_t size( int d ) const { return d_N[d]; }
+    inline ArraySize size() const { return d_size; }


    //! Return the size of the Array
-    inline size_t length() const { return d_length; }
+    inline size_t size( int d ) const { return d_size[d]; }
+
+
+    //! Return the total length of the Array
+    inline size_t length() const { return d_size.length(); }


    //! Return true if the Array is empty
-    inline bool empty() const { return d_length == 0; }
+    inline bool empty() const { return d_size.length() == 0; }


    /*!
@@ -329,7 +578,8 @@ public:
     * Resize the Array
     * @param N             Number of elements in each dimension
     */
-    void resize( const std::vector<size_t> &N );
+    void resize( const ArraySize &N );
+

    /*!
     * Resize the given dimension of the array
@@ -344,7 +594,7 @@ public:
     * Reshape the Array (total size of array will not change)
     * @param N             Number of elements in each dimension
     */
-    void reshape( const std::vector<size_t> &N );
+    void reshape( const ArraySize &N );


    /*!
@@ -352,7 +602,16 @@ public:
     * @param index         Index to subset (imin,imax,jmin,jmax,kmin,kmax,...)
     */
    template<class TYPE2 = TYPE>
-    Array<TYPE2> subset( const std::vector<size_t> &index ) const;
+    Array<TYPE2, FUN> subset( const std::vector<size_t> &index ) const;
+
+
+    /*!
+     * Subset the Array (total size of array will not change)
+     * @param index         Index to subset (ix:kx:jx,iy:ky:jy,...)
+     */
+    template<class TYPE2 = TYPE>
+    Array<TYPE2, FUN> subset( const std::vector<Range<size_t>> &index ) const;
+

    /*!
     * Copy data from an array into a subset of this array
@@ -360,32 +619,48 @@ public:
     * @param subset        The subset array to copy from
     */
    template<class TYPE2>
-    void copySubset( const std::vector<size_t> &index, const Array<TYPE2> &subset );
+    void copySubset( const std::vector<size_t> &index, const Array<TYPE2, FUN> &subset );
+
+    /*!
+     * Copy data from an array into a subset of this array
+     * @param index         Index of the subset
+     * @param subset        The subset array to copy from
+     */
+    template<class TYPE2>
+    void copySubset( const std::vector<Range<size_t>> &index, const Array<TYPE2, FUN> &subset );

    /*!
     * Add data from an array into a subset of this array
     * @param index         Index of the subset (imin,imax,jmin,jmax,kmin,kmax,...)
     * @param subset        The subset array to add from
     */
-    void addSubset( const std::vector<size_t> &index, const Array<TYPE> &subset );
+    void addSubset( const std::vector<size_t> &index, const Array<TYPE, FUN> &subset );
+
+    /*!
+     * Add data from an array into a subset of this array
+     * @param index         Index of the subset
+     * @param subset        The subset array to add from
+     */
+    void addSubset( const std::vector<Range<size_t>> &index, const Array<TYPE, FUN> &subset );


+public: // Accessors
    /*!
     * Access the desired element
     * @param i             The row index
     */
-    HOST_DEVICE inline TYPE &operator()( size_t i )
+    HOST_DEVICE inline TYPE &operator()( size_t i ) ATTRIBUTE_INLINE
    {
-        CHECK_ARRAY_INDEX3D( d_N, i, 0, 0 ) return d_data[i];
+        return d_data[d_size.index( i )];
    }

    /*!
     * Access the desired element
     * @param i             The row index
     */
-    HOST_DEVICE inline const TYPE &operator()( size_t i ) const
+    HOST_DEVICE inline const TYPE &operator()( size_t i ) const ATTRIBUTE_INLINE
    {
-        CHECK_ARRAY_INDEX3D( d_N, i, 0, 0 ) return d_data[i];
+        return d_data[d_size.index( i )];
    }

    /*!
@@ -393,9 +668,9 @@ public:
     * @param i             The row index
     * @param j             The column index
     */
-    HOST_DEVICE inline TYPE &operator()( size_t i, size_t j )
+    HOST_DEVICE inline TYPE &operator()( size_t i, size_t j ) ATTRIBUTE_INLINE
    {
-        CHECK_ARRAY_INDEX3D( d_N, i, j, 0 ) return d_data[i + j * d_N[0]];
+        return d_data[d_size.index( i, j )];
    }

    /*!
@@ -403,9 +678,9 @@ public:
     * @param i             The row index
     * @param j             The column index
     */
-    HOST_DEVICE inline const TYPE &operator()( size_t i, size_t j ) const
+    HOST_DEVICE inline const TYPE &operator()( size_t i, size_t j ) const ATTRIBUTE_INLINE
    {
-        CHECK_ARRAY_INDEX3D( d_N, i, j, 0 ) return d_data[i + j * d_N[0]];
+        return d_data[d_size.index( i, j )];
    }

    /*!
@@ -414,9 +689,9 @@ public:
     * @param j             The column index
     * @param k             The third index
     */
-    HOST_DEVICE inline TYPE &operator()( size_t i, size_t j, size_t k )
+    HOST_DEVICE inline TYPE &operator()( size_t i, size_t j, size_t k ) ATTRIBUTE_INLINE
    {
-        CHECK_ARRAY_INDEX3D( d_N, i, j, k ) return d_data[GET_ARRAY_INDEX3D( d_N, i, j, k )];
+        return d_data[d_size.index( i, j, k )];
    }

    /*!
@@ -425,35 +700,109 @@ public:
     * @param j             The column index
     * @param k             The third index
     */
-    HOST_DEVICE inline const TYPE &operator()( size_t i, size_t j, size_t k ) const
+    HOST_DEVICE inline const TYPE &operator()( size_t i, size_t j, size_t k ) const ATTRIBUTE_INLINE
    {
-        CHECK_ARRAY_INDEX3D( d_N, i, j, k ) return d_data[GET_ARRAY_INDEX3D( d_N, i, j, k )];
+        return d_data[d_size.index( i, j, k )];
    }

    /*!
     * Access the desired element
-     * @param i             The row index
-     * @param j             The column index
-     * @param k             The third index
-     * @param l             The fourth index
+     * @param i1            The first index
+     * @param i2            The second index
+     * @param i3            The third index
+     * @param i4            The fourth index
     */
-    HOST_DEVICE inline TYPE &operator()( size_t i, size_t j, size_t k, size_t l )
+    HOST_DEVICE inline TYPE &operator()(
+        size_t i1, size_t i2, size_t i3, size_t i4 ) ATTRIBUTE_INLINE
    {
-        CHECK_ARRAY_INDEX4D( d_N, i, j, k, l ) return d_data[GET_ARRAY_INDEX4D( d_N, i, j, k, l )];
+        return d_data[d_size.index( i1, i2, i3, i4 )];
    }

    /*!
     * Access the desired element
-     * @param i             The row index
-     * @param j             The column index
-     * @param k             The third index
-     * @param l             The fourth index
+     * @param i1            The first index
+     * @param i2            The second index
+     * @param i3            The third index
+     * @param i4            The fourth index
     */
-    HOST_DEVICE inline const TYPE &operator()( size_t i, size_t j, size_t k, size_t l ) const
+    HOST_DEVICE inline const TYPE &operator()(
+        size_t i1, size_t i2, size_t i3, size_t i4 ) const ATTRIBUTE_INLINE
    {
-        CHECK_ARRAY_INDEX4D( d_N, i, j, k, l ) return d_data[GET_ARRAY_INDEX4D( d_N, i, j, k, l )];
+        return d_data[d_size.index( i1, i2, i3, i4 )];
    }

+    /*!
+     * Access the desired element
+     * @param i1            The first index
+     * @param i2            The second index
+     * @param i3            The third index
+     * @param i4            The fourth index
+     * @param i5            The fifth index
+     */
+    HOST_DEVICE inline TYPE &operator()(
+        size_t i1, size_t i2, size_t i3, size_t i4, size_t i5 ) ATTRIBUTE_INLINE
+    {
+        return d_data[d_size.index( i1, i2, i3, i4, i5 )];
+    }
+
+    /*!
+     * Access the desired element
+     * @param i1            The first index
+     * @param i2            The second index
+     * @param i3            The third index
+     * @param i4            The fourth index
+     * @param i5            The fifth index
+     */
+    HOST_DEVICE inline const TYPE &operator()(
+        size_t i1, size_t i2, size_t i3, size_t i4, size_t i5 ) const ATTRIBUTE_INLINE
+    {
+        return d_data[d_size.index( i1, i2, i3, i4, i5 )];
+    }
+
+    /*!
+     * Access the desired element as a raw pointer
+     * @param i             The global index
+     */
+    HOST_DEVICE inline TYPE *ptr( size_t i ) ATTRIBUTE_INLINE
+    {
+        return i >= d_size.length() ? nullptr : &d_data[i];
+    }
+
+    /*!
+     * Access the desired element as a raw pointer
+     * @param i             The global index
+     */
+    HOST_DEVICE inline const TYPE *ptr( size_t i ) const ATTRIBUTE_INLINE
+    {
+        return i >= d_size.length() ? nullptr : &d_data[i];
+    }
+
+    //! Get iterator to beginning of data
+    inline TYPE *begin() ATTRIBUTE_INLINE { return d_data; }
+
+    //! Get iterator to beginning of data
+    inline const TYPE *begin() const ATTRIBUTE_INLINE { return d_data; }
+
+    //! Get iterator to end of data
+    inline TYPE *end() ATTRIBUTE_INLINE { return d_data + d_size.length(); }
+
+    //! Get iterator to end of data
+    inline const TYPE *end() const ATTRIBUTE_INLINE { return d_data + d_size.length(); }
+
+    //! Return the pointer to the raw data
+    inline std::shared_ptr<TYPE> getPtr() ATTRIBUTE_INLINE { return d_ptr; }
+
+    //! Return the pointer to the raw data
+    inline std::shared_ptr<const TYPE> getPtr() const ATTRIBUTE_INLINE { return d_ptr; }
+
+    //! Return the pointer to the raw data
+    HOST_DEVICE inline TYPE *data() ATTRIBUTE_INLINE { return d_data; }
+
+    //! Return the pointer to the raw data
+    HOST_DEVICE inline const TYPE *data() const ATTRIBUTE_INLINE { return d_data; }
+
+
+public: // Operator overloading
    //! Check if two matrices are equal
    // Equality means the dimensions and data have to be identical
    bool operator==( const Array &rhs ) const;
@@ -461,19 +810,28 @@ public:
    //! Check if two matrices are not equal
    inline bool operator!=( const Array &rhs ) const { return !this->operator==( rhs ); }

+    //! Add another array
+    Array &operator+=( const Array &rhs );

-    //! Return the pointer to the raw data
-    inline std::shared_ptr<TYPE> getPtr() { return d_ptr; }
+    //! Subtract another array
+    Array &operator-=( const Array &rhs );

-    //! Return the pointer to the raw data
-    inline std::shared_ptr<const TYPE> getPtr() const { return d_ptr; }
+    //! Add a scalar
+    Array &operator+=( const TYPE &rhs );

-    //! Return the pointer to the raw data
-    HOST_DEVICE inline TYPE *data() { return d_data; }
+    //! Subtract a scalar
+    Array &operator-=( const TYPE &rhs );

-    //! Return the pointer to the raw data
-    HOST_DEVICE inline const TYPE *data() const { return d_data; }

+public: // Math operations
+    //! Concatenates the arrays along the dimension dim.
+    static Array cat( const std::vector<Array> &x, int dim = 0 );
+
+    //! Concatenates a given array with the current array
+    void cat( const Array &x, int dim = 0 );
+
+    //! Initialize the array with random values (defined from the function table)
+    void rand();

    //! Return true if NaNs are present
    inline bool NaNs() const;
@@ -491,13 +849,13 @@ public:
    inline TYPE mean() const;

    //! Return the min of all elements in a given direction
-    Array<TYPE> min( int dir ) const;
+    Array<TYPE, FUN> min( int dir ) const;

    //! Return the max of all elements in a given direction
-    Array<TYPE> max( int dir ) const;
+    Array<TYPE, FUN> max( int dir ) const;

    //! Return the sum of all elements in a given direction
-    Array<TYPE> sum( int dir ) const;
+    Array<TYPE, FUN> sum( int dir ) const;

    //! Return the smallest value
    inline TYPE min( const std::vector<size_t> &index ) const;
@@ -511,52 +869,86 @@ public:
    //! Return the mean of all elements
    inline TYPE mean( const std::vector<size_t> &index ) const;

+    //! Return the smallest value
+    inline TYPE min( const std::vector<Range<size_t>> &index ) const;
+
+    //! Return the largest value
+    inline TYPE max( const std::vector<Range<size_t>> &index ) const;
+
+    //! Return the sum of all elements
+    inline TYPE sum( const std::vector<Range<size_t>> &index ) const;
+
+    //! Return the mean of all elements
+    inline TYPE mean( const std::vector<Range<size_t>> &index ) const;
+
    //! Find all elements that match the operator
    std::vector<size_t> find(
        const TYPE &value, std::function<bool( const TYPE &, const TYPE & )> compare ) const;

-    //! Add another array
-    Array &operator+=( const Array &rhs );
-
-    //! Subtract another array
-    Array &operator-=( const Array &rhs );
-
-    //! Add a scalar
-    Array &operator+=( const TYPE &rhs );
-
-    //! Subtract a scalar
-    Array &operator-=( const TYPE &rhs );

    //! Print an array
-    void print( std::ostream& os, const std::string& name="A", const std::string& prefix="" ) const;
+    void print(
+        std::ostream &os, const std::string &name = "A", const std::string &prefix = "" ) const;

    //! Multiply two arrays
    static Array multiply( const Array &a, const Array &b );

    //! Transpose an array
-    Array<TYPE> reverseDim( ) const;
+    Array<TYPE, FUN> reverseDim() const;
+
+    //! Replicate an array a given number of times in each direction
+    Array<TYPE, FUN> repmat( const std::vector<size_t> &N ) const;

    //! Coarsen an array using the given filter
-    Array<TYPE> coarsen( const Array<TYPE>& filter ) const;
+    Array<TYPE, FUN> coarsen( const Array<TYPE, FUN> &filter ) const;

    //! Coarsen an array using the given filter
-    Array<TYPE> coarsen( const std::vector<size_t>& ratio, std::function<TYPE(const Array<TYPE>&)> filter ) const;
+    Array<TYPE, FUN> coarsen( const std::vector<size_t> &ratio,
+        std::function<TYPE( const Array<TYPE, FUN> & )> filter ) const;
+
+    /*!
+     * Perform an element-wise operation y = f(x)
+     * @param[in] fun           The function operation
+     * @param[in] x             The input array
+     */
+    static Array transform( std::function<TYPE( const TYPE & )> fun, const Array &x );
+
+    /*!
+     * Perform an element-wise operation z = f(x,y)
+     * @param[in] fun           The function operation
+     * @param[in] x             The first array
+     * @param[in] y             The second array
+     */
+    static Array transform(
+        std::function<TYPE( const TYPE &, const TYPE & )> fun, const Array &x, const Array &y );
+
+    /*!
+     * axpby operation: this = alpha*x + beta*this
+     * @param[in] alpha         alpha
+     * @param[in] x             x
+     * @param[in] beta          beta
+     */
+    void axpby( const TYPE &alpha, const Array<TYPE, FUN> &x, const TYPE &beta );

 private:
-    int d_ndim;                  // Number of dimensions in array
-    size_t d_N[ARRAY_NDIM_MAX];  // Size of each dimension
-    size_t d_length;             // Total length of array
+    ArraySize d_size;            // Size of each dimension
    TYPE *d_data;                // Raw pointer to data in array
    std::shared_ptr<TYPE> d_ptr; // Shared pointer to data in array
-    void allocate( const std::vector<size_t> &N );
+    void allocate( const ArraySize &N );
+
+public:
+    template<class TYPE2, class FUN2>
+    inline bool sizeMatch( const Array<TYPE2, FUN2> &rhs ) const
+    {
+        return d_size == rhs.d_size;
+    }

 private:
-    template<class TYPE2>
-    inline bool sizeMatch( const Array<TYPE2>& rhs ) const;
-    inline void checkSubsetIndex( const std::vector<size_t> &index ) const;
-    inline std::array<size_t, 5> getDimArray() const;
-    static inline void getSubsetArrays( const std::vector<size_t> &index,
-        std::array<size_t, 5> &first, std::array<size_t, 5> &last, std::array<size_t, 5> &N );
+    inline void checkSubsetIndex( const std::vector<Range<size_t>> &range ) const;
+    inline std::vector<Range<size_t>> convert( const std::vector<size_t> &index ) const;
+    static inline void getSubsetArrays( const std::vector<Range<size_t>> &range,
+        std::array<size_t, 5> &first, std::array<size_t, 5> &last, std::array<size_t, 5> &inc,
+        std::array<size_t, 5> &N );
 };


--- a/common/Array.hpp
+++ b/common/Array.hpp
--- a/common/FunctionTable.h
+++ b/common/FunctionTable.h
@@ -0,0 +1,81 @@
+#ifndef included_FunctionTable
+#define included_FunctionTable
+
+
+#include "common/Array.h"
+
+#include <functional>
+
+
+/*!
+ * Class FunctionTable is a serial function table class that defines
+ *   a series of operations that can be performed on the Array class.
+ *   Users can implement additional versions of the function table that match
+ *   the interface to change the behavior of the array class.
+ */
+class FunctionTable final
+{
+public:
+    /*!
+     * Initialize the array with random values
+     * @param[in,out] x     The array to fill (its existing values are overwritten)
+     */
+    template<class TYPE, class FUN>
+    static void rand( Array<TYPE, FUN> &x );
+
+    /*!
+     * Perform a reduction y = f(x), folding all elements of the array into one value
+     * @param[in] op        The binary operator, called as op( element, accumulator )
+     *                      Note: the operator is a template parameter
+     *                      (compared to a std::function to improve performance)
+     * @param[in] A         The array to operate on
+     * @return              The reduction (a value-initialized TYPE if A is empty)
+     */
+    template<class TYPE, class FUN, typename LAMBDA>
+    static inline TYPE reduce( LAMBDA &op, const Array<TYPE, FUN> &A );
+
+    /*!
+     * Perform an element-wise operation y = f(x)
+     * @param[in] fun       The function operation
+     *                      Note: the operator is a template parameter
+     *                      (compared to a std::function to improve performance)
+     * @param[in] x         The input array to operate on
+     * @param[out] y        The output array (resized to the size of x)
+     */
+    template<class TYPE, class FUN, typename LAMBDA>
+    static inline void transform( LAMBDA &fun, const Array<TYPE, FUN> &x, Array<TYPE, FUN> &y );
+
+    /*!
+     * Perform an element-wise operation z = f(x,y)
+     * @param[in] fun       The function operation
+     *                      Note: the operator is a template parameter
+     *                      (compared to a std::function to improve performance)
+     * @param[in] x         The first array
+     * @param[in] y         The second array (must match the size of x)
+     * @param[out] z        The result (resized to the size of x)
+     */
+    template<class TYPE, class FUN, typename LAMBDA>
+    static inline void transform(
+        LAMBDA &fun, const Array<TYPE, FUN> &x, const Array<TYPE, FUN> &y, Array<TYPE, FUN> &z );
+
+    /*!
+     * Multiply two arrays (only arrays with at most two dimensions are supported)
+     * @param[in] a             The first array
+     * @param[in] b             The second array (b.size(0) must equal a.size(1))
+     * @param[out] c            The output array (resized to a.size(0) x b.size(1))
+     */
+    template<class TYPE, class FUN>
+    static void multiply(
+        const Array<TYPE, FUN> &a, const Array<TYPE, FUN> &b, Array<TYPE, FUN> &c );
+
+
+private:
+    FunctionTable(); // Private: the class only provides static member functions
+
+    template<class T>
+    static inline void rand( size_t N, T *x ); // Type-specific kernel filling x[0..N)
+};
+
+#include "common/FunctionTable.hpp"
+
+#endif
--- a/common/FunctionTable.hpp
+++ b/common/FunctionTable.hpp
@@ -0,0 +1,116 @@
+#ifndef included_FunctionTable_hpp
+#define included_FunctionTable_hpp
+
+#include "common/FunctionTable.h"
+#include "common/Utilities.h"
+
+#include <algorithm>
+#include <cstring>
+#include <limits>
+#include <random>
+
+
+/********************************************************
+ *  Random number initialization                         *
+ ********************************************************/
+template<class TYPE, class FUN>
+void FunctionTable::rand( Array<TYPE, FUN> &x )
+{
+    rand<TYPE>( x.length(), x.data() ); // Fill the raw storage using the per-type kernel
+}
+template<>
+inline void FunctionTable::rand<double>( size_t N, double *x )
+{
+    // Seed a Mersenne Twister from the system entropy source, then draw N uniform samples
+    std::random_device seed;
+    std::mt19937 engine( seed() );
+    std::uniform_real_distribution<double> unit( 0, 1 );
+    std::generate( x, x + N, [&]() { return unit( engine ); } );
+}
+template<>
+inline void FunctionTable::rand<float>( size_t N, float *x )
+{
+    std::random_device rd;
+    std::mt19937 gen( rd() );
+    // Sample in single precision: narrowing a double draw to float can round up to 1.0f
+    std::uniform_real_distribution<float> dis( 0, 1 );
+    std::generate( x, x + N, [&]() { return dis( gen ); } );
+}
+template<>
+inline void FunctionTable::rand<int>( size_t N, int *x )
+{
+    // Default-constructed distribution: values span [0, std::numeric_limits<int>::max()]
+    std::random_device seed;
+    std::mt19937 engine( seed() );
+    std::uniform_int_distribution<int> any_int;
+    std::generate( x, x + N, [&]() { return any_int( engine ); } );
+}
+
+
+/********************************************************
+ *  Reduction                                            *
+ ********************************************************/
+template<class TYPE, class FUN, typename LAMBDA>
+inline TYPE FunctionTable::reduce( LAMBDA &op, const Array<TYPE, FUN> &A )
+{
+    const size_t count = A.length();
+    if ( count == 0 ) // Nothing to fold: return a value-initialized TYPE
+        return TYPE();
+    const TYPE *first = A.data();
+    TYPE result       = first[0];
+    for ( const TYPE *it = first + 1; it != first + count; ++it ) // op( element, accumulator )
+        result = op( *it, result );
+    return result;
+}
+
+
+/********************************************************
+ *  Unary transformation                                 *
+ ********************************************************/
+template<class TYPE, class FUN, typename LAMBDA>
+inline void FunctionTable::transform( LAMBDA &fun, const Array<TYPE, FUN> &x, Array<TYPE, FUN> &y )
+{
+    // The output takes the size of the input, then fun is applied element by element
+    y.resize( x.size() );
+    for ( size_t idx = 0, len = x.length(); idx < len; ++idx )
+        y( idx ) = fun( x( idx ) );
+}
+template<class TYPE, class FUN, typename LAMBDA>
+inline void FunctionTable::transform(
+    LAMBDA &fun, const Array<TYPE, FUN> &x, const Array<TYPE, FUN> &y, Array<TYPE, FUN> &z )
+{
+    // Reject inputs of different sizes before the output is resized
+    if ( !x.sizeMatch( y ) )
+        throw std::logic_error( "Sizes of x and y do not match" );
+    z.resize( x.size() );
+    for ( size_t idx = 0, len = x.length(); idx < len; ++idx )
+        z( idx ) = fun( x( idx ), y( idx ) );
+}
+
+
+/********************************************************
+ *  Multiply two arrays                                  *
+ ********************************************************/
+template<class TYPE, class FUN>
+void FunctionTable::multiply(
+    const Array<TYPE, FUN> &a, const Array<TYPE, FUN> &b, Array<TYPE, FUN> &c )
+{
+    // Only the product of arrays with at most two dimensions is implemented
+    if ( a.ndim() > 2 || b.ndim() > 2 )
+        throw std::logic_error( "Not finished yet" );
+    if ( a.size( 1 ) != b.size( 0 ) )
+        throw std::logic_error( "Inner dimensions must match" );
+    // c is resized to a.size( 0 ) x b.size( 1 ) and starts from zero
+    c.resize( a.size( 0 ), b.size( 1 ) );
+    c.fill( 0 );
+    // Accumulate c(i,k) += a(i,j) * b(j,k); the first index i varies fastest
+    for ( size_t k = 0; k < b.size( 1 ); k++ ) {
+        for ( size_t j = 0; j < a.size( 1 ); j++ ) {
+            for ( size_t i = 0; i < a.size( 0 ); i++ )
+                c( i, k ) += a( i, j ) * b( j, k );
+        }
+    }
+}
+
+
+#endif
--- a/common/StackTrace.cpp
+++ b/common/StackTrace.cpp
--- a/common/StackTrace.h
+++ b/common/StackTrace.h
@@ -1,14 +1,11 @@
-#ifndef included_AtomicStackTrace
-#define included_AtomicStackTrace
+#ifndef included_StackTrace
+#define included_StackTrace

 #include <functional>
 #include <iostream>
-#include <stdio.h>
-#include <stdlib.h>
-#include <vector>
-#include <thread>
-#include <memory>
 #include <set>
+#include <thread>
+#include <vector>


 // Check for and include MPI
@@ -39,12 +36,16 @@ struct stack_info {
    int line;
    //! Default constructor
    stack_info() : address( nullptr ), address2( nullptr ), line( 0 ) {}
+    //! Reset the stack
+    void clear();
    //! Operator==
    bool operator==( const stack_info &rhs ) const;
    //! Operator!=
    bool operator!=( const stack_info &rhs ) const;
+    //! Get the minimum width to print the addresses
+    int getAddressWidth() const;
    //! Print the stack info
-    std::string print() const;
+    std::string print( int widthAddress = 16, int widthObject = 20, int widthFunction = 32 ) const;
    //! Compute the number of bytes needed to store the object
    size_t size() const;
    //! Pack the data to a byte array, returning a pointer to the end of the data
@@ -59,15 +60,27 @@ struct stack_info {


 struct multi_stack_info {
-    int N;
-    stack_info stack;
-    std::vector<multi_stack_info> children;
+    int N;                                  // Number of threads/processes
+    stack_info stack;                       // Current stack item
+    std::vector<multi_stack_info> children; // Children
    //! Default constructor
    multi_stack_info() : N( 0 ) {}
+    //! Construct from a simple call stack
+    explicit multi_stack_info( const std::vector<stack_info> & );
+    //! Assignment operator from a simple call stack
+    multi_stack_info &operator=( const std::vector<stack_info> & );
+    //! Reset the stack
+    void clear();
    //! Add the given stack to the multistack
-    void add( size_t N, const stack_info *stack );
+    void add( size_t len, const stack_info *stack );
    //! Print the stack info
    std::vector<std::string> print( const std::string &prefix = std::string() ) const;
+
+private:
+    void print2( const std::string &prefix, int w[3], std::vector<std::string> &text ) const;
+    int getAddressWidth() const;
+    int getObjectWidth() const;
+    int getFunctionWidth() const;
 };


@@ -110,6 +123,16 @@ multi_stack_info getAllCallStacks( );
 multi_stack_info getGlobalCallStacks();


+/*!
+ * @brief  Clean up the stack trace
+ * @details  This function modifies the stack trace to remove entries
+ *    related to acquiring the stack trace in an attempt to make it
+ *    more useful for display/users.
+ * @param[in,out] stack     The stack trace to modify
+ */
+void cleanupStackTrace( multi_stack_info &stack );
+
+
 //! Function to return the current call stack for the current thread
 std::vector<void *> backtrace();

@@ -136,8 +159,9 @@ std::string signalName( int signal );
 * Return the symbols from the current executable (not availible for all platforms)
 * @return      Returns 0 if sucessful
 */
-int getSymbols(
-    std::vector<void *> &address, std::vector<char> &type, std::vector<std::string> &obj );
+int getSymbols( std::vector<void *> &address,
+                std::vector<char> &type,
+                std::vector<std::string> &obj );


 /*!
@@ -159,14 +183,15 @@ enum class terminateType { signal, exception };

 /*!
 * Set the error handlers
- * @param[in]   Function to terminate the program: abort(msg,type)
+ * @param[in] abort     Function to terminate the program: abort(msg,type)
 */
 void setErrorHandlers( std::function<void( std::string, terminateType )> abort );


 /*!
 * Set the given signals to the handler
- * @param[in]   Function to terminate the program: abort(msg,type)
+ * @param[in] signals   Signals to handle
+ * @param[in] handler   Function to terminate the program: abort(msg,type)
 */
 void setSignals( const std::vector<int> &signals, void ( *handler )( int ) );

@@ -213,4 +238,5 @@ std::string exec( const std::string& cmd, int& exit_code );

 } // namespace StackTrace

+
 #endif
--- a/common/UnitTest.cpp
+++ b/common/UnitTest.cpp
@@ -1,37 +1,99 @@
-#include <iostream>
-#include <sstream>
-#include <vector>
-#include <string>
 #include "common/UnitTest.h"
 #include "common/Utilities.h"
+#include <cstring>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <vector>


-#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
-    // Windows
-    // Sleep is defined in milliseconds
-#else
-    // Linux
-    // usleep is defined in microseconds, create a Sleep command
-    #define Sleep(x) usleep(x*1000)
-#endif
-
+#define pout std::cout
+#define printp printf


 /********************************************************************
-*  Empty Constructor                                                *
+ *  Constructor/Destructor                                           *
 ********************************************************************/
-UnitTest::UnitTest() {
+UnitTest::UnitTest()
+{
 #ifdef USE_MPI
    comm = MPI_COMM_WORLD;
 #endif
 }
+UnitTest::~UnitTest() { reset(); }
+void UnitTest::reset()
+{
+    // Hold the mutex for the whole reset; the guard releases it on scope exit
+    std::lock_guard<decltype( mutex )> guard( mutex );
+    // Swap each list with an empty temporary so that its storage is released as well
+    // (clear() alone would keep the allocated capacity)
+    for ( auto *list : { &pass_messages, &fail_messages, &expected_fail_messages } )
+        std::vector<std::string>().swap( *list );
+}
+
+
+/********************************************************************
+ *  Add a pass, fail, expected failure message in a thread-safe way  *
+ ********************************************************************/
+void UnitTest::passes( const std::string &in )
+{
+    // Record the message under the mutex; the guard unlocks even if push_back throws
+    std::lock_guard<decltype( mutex )> lock( mutex );
+    pass_messages.push_back( in );
+}
+void UnitTest::failure( const std::string &in )
+{
+    // Record the message under the mutex; the guard unlocks even if push_back throws
+    std::lock_guard<decltype( mutex )> lock( mutex );
+    fail_messages.push_back( in );
+}
+void UnitTest::expected_failure( const std::string &in )
+{
+    // Record the message under the mutex; the guard unlocks even if push_back throws
+    std::lock_guard<decltype( mutex )> lock( mutex );
+    expected_fail_messages.push_back( in );
+}


 /********************************************************************
 *  Print a global report                                            *
 *  Note: only rank 0 will print, all messages will be aggregated    *
 ********************************************************************/
-void UnitTest::report(const int level0) {
+inline std::vector<int> UnitTest::allGather( int value ) const
+{
+    const int nranks = getSize();
+    std::vector<int> gathered( nranks, value ); // Already the full answer for a single rank
+#ifdef USE_MPI
+    if ( nranks >= 2 ) // Collect the value held by every rank of comm
+        MPI_Allgather( &value, 1, MPI_INT, gathered.data(), 1, MPI_INT, comm );
+#endif
+    return gathered;
+}
+inline void UnitTest::barrier() const
+{
+#ifdef USE_MPI
+    if ( getSize() >= 2 ) // Nothing to synchronize for a single rank
+        MPI_Barrier( comm );
+#endif
+}
+static inline void print_messages( const std::vector<std::vector<std::string>> &messages )
+{
+    // One list per rank; multiple ranks get a per-rank header, an empty input prints nothing
+    if ( messages.size() == 1 ) {
+        for ( const auto &msg : messages[0] )
+            pout << "    " << msg << std::endl;
+        return;
+    }
+    for ( size_t i = 0; i < messages.size(); i++ ) {
+        if ( !messages[i].empty() )
+            printp( "     Processor %i:\n", static_cast<int>( i ) );
+        for ( const auto &msg : messages[i] )
+            pout << "        " << msg << std::endl;
+    }
+}
+void UnitTest::report( const int level0 ) const
+{
+    mutex.lock();
    int size = getSize();
    int rank = getRank();
    // Broadcast the print level from rank 0
@@ -43,27 +105,15 @@ void UnitTest::report(const int level0) {
    if ( level < 0 || level > 2 )
        ERROR( "Invalid print level" );
    // Perform a global all gather to get the number of failures per processor
-    std::vector<int> N_pass(size,0);
-    std::vector<int> N_fail(size,0);
-    std::vector<int> N_expected_fail(size,0);
-    int local_pass_size = (int) pass_messages.size();
-    int local_fail_size = (int) fail_messages.size();
-    int local_expected_fail_size = (int) expected_fail_messages.size();
-    if ( getSize() > 1 ) {
-        #ifdef USE_MPI
-            MPI_Allgather( &local_pass_size, 1, MPI_INT, &N_pass[0], 1, MPI_INT, comm);
-            MPI_Allgather( &local_fail_size, 1, MPI_INT, &N_fail[0], 1, MPI_INT, comm);
-            MPI_Allgather( &local_expected_fail_size, 1, MPI_INT, &N_expected_fail[0], 1, MPI_INT, comm);
-        #endif 
-    } else {
-        N_pass[0] = local_pass_size;
-        N_fail[0] = local_fail_size;
-        N_expected_fail[0] = local_expected_fail_size;
-    }
+    auto N_pass             = allGather( pass_messages.size() );
+    auto N_fail             = allGather( fail_messages.size() );
+    auto N_expected_fail    = allGather( expected_fail_messages.size() );
    int N_pass_tot          = 0;
+    int N_fail_tot          = 0;
    int N_expected_fail_tot = 0;
    for ( int i = 0; i < size; i++ ) {
        N_pass_tot += N_pass[i];
+        N_fail_tot += N_fail[i];
        N_expected_fail_tot += N_expected_fail[i];
    }
    // Send all messages to rank 0 (if needed)
@@ -71,158 +121,134 @@ void UnitTest::report(const int level0) {
    std::vector<std::vector<std::string>> fail_messages_rank( size );
    std::vector<std::vector<std::string>> expected_fail_rank( size );
    // Get the pass messages
-    if ( ( level==1 && N_pass_tot<=20 ) || level==2 ) {
-        if ( rank==0 ) {
-            // Rank 0 should receive all messages
-            for (int i=0; i<size; i++) {
-                if ( i==0 )
-                    pass_messages_rank[i] = pass_messages;
-                else if ( N_pass[i]>0 )
-                    pass_messages_rank[i] = unpack_message_stream(i,1);
-            }
-        } else if ( pass_messages.size() ) {
-            // All other ranks send their message (use non-blocking communication)
-            pack_message_stream(pass_messages,0,1);
-        }
-    }
+    if ( ( level == 1 && N_pass_tot <= 20 ) || level == 2 )
+        pass_messages_rank = UnitTest::gatherMessages( pass_messages, 1 );
    // Get the fail messages
-    if ( level==1 || level==2 ) {
-        if ( rank==0 ) {
-            // Rank 0 should receive all messages
-            for (int i=0; i<size; i++) {
-                if ( i==0 )
-                    fail_messages_rank[i] = fail_messages;
-                else if ( N_fail[i]>0 )
-                    fail_messages_rank[i] = unpack_message_stream(i,2);
-            }
-        } else if ( !fail_messages.empty() ){
-            // All other ranks send their message (use non-blocking communication)
-            pack_message_stream(fail_messages,0,2);
-        }
-    }
+    if ( level == 1 || level == 2 )
+        fail_messages_rank = UnitTest::gatherMessages( fail_messages, 2 );
    // Get the expected_fail messages
-    if ( ( level==1 && N_expected_fail_tot<=50 ) || level==2 ) {
-        if ( rank==0 ) {
-            // Rank 0 should receive all messages
-            for (int i=0; i<size; i++) {
-                if ( i==0 )
-                    expected_fail_rank[i] = expected_fail_messages;
-                else if ( N_expected_fail[i]>0 )
-                    expected_fail_rank[i] = unpack_message_stream(i,3);
-            }
-        } else if ( !expected_fail_messages.empty() ){
-            // All other ranks send their message (use non-blocking communication)
-            pack_message_stream(expected_fail_messages,0,3);
-        }
-    }
+    if ( ( level == 1 && N_expected_fail_tot <= 50 ) || level == 2 ) // tag 3: 1=pass, 2=fail
+        expected_fail_rank = UnitTest::gatherMessages( expected_fail_messages, 3 );
    // Print the results of all messages (only rank 0 will print)
    if ( rank == 0 ) {
-        std::cout << std::endl;
+        pout << std::endl;
        // Print the passed tests
-        std::cout << "Tests passed" << std::endl;
+        pout << "Tests passed" << std::endl;
        if ( level == 0 || ( level == 1 && N_pass_tot > 20 ) ) {
            // We want to print a summary
            if ( size > 8 ) {
                // Print 1 summary for all processors
-                std::cout << "     " << N_pass_tot << " tests passed (use report level 2 for more detail)" << std::endl;
+                printp( "     %i tests passed (use report level 2 for more detail)\n", N_pass_tot );
            } else {
                // Print a summary for each processor
                for ( int i = 0; i < size; i++ )
-                    std::cout << "     " << N_pass[i] << " tests passed (proc " << i << ") (use report level 2 for more detail)" << std::endl;
+                    printp( "     %i tests passed (proc %i) (use report level 2 for more detail)\n",
+                        N_pass[i], i );
            }
        } else {
            // We want to print all messages
-            for (int i=0; i<size; i++) {
+            for ( int i = 0; i < size; i++ )
                ASSERT( (int) pass_messages_rank[i].size() == N_pass[i] );
-                if ( N_pass[i] > 0 ) {
-                    std::cout << "     Proccessor " << i << ":" << std::endl;
-                    for (unsigned int j=0; j<pass_messages_rank[i].size(); j++)
-                        std::cout << "        " <<  pass_messages_rank[i][j] << std::endl;
+            print_messages( pass_messages_rank );
        }
-            }
-        }
-        std::cout << std::endl;
+        pout << std::endl;
        // Print the tests that failed
-        std::cout << "Tests failed" << std::endl;
+        pout << "Tests failed" << std::endl;
        if ( level == 0 ) {
            // We want to print a summary
            if ( size > 8 ) {
                // Print 1 summary for all processors
-                std::cout << "     " << N_pass_tot << " tests failed (use report level 2 for more detail)" << std::endl;
+                printp( "     %i tests failed (use report level 2 for more detail)\n", N_fail_tot );
            } else {
                // Print a summary for each processor
                for ( int i = 0; i < size; i++ )
-                    std::cout << "     " << N_fail[i] << " tests failed (proc " << i << ") (use report level 1 or 2 for more detail)" << std::endl;
+                    printp( "     %i tests failed (proc %i) (use report level 2 for more detail)\n",
+                        N_fail[i], i );
            }
        } else {
            // We want to print all messages
-            for (int i=0; i<size; i++) {
+            for ( int i = 0; i < size; i++ )
                ASSERT( (int) fail_messages_rank[i].size() == N_fail[i] );
-                if ( N_fail[i] > 0 ) {
-                    std::cout << "     Processor " << i << ":" << std::endl;
-                    for (unsigned int j=0; j<fail_messages_rank[i].size(); j++)
-                        std::cout << "        " <<  fail_messages_rank[i][j] << std::endl;
+            print_messages( fail_messages_rank );
        }
-            }
-        }
-        std::cout << std::endl;
+        pout << std::endl;
        // Print the tests that expected failed
-        std::cout << "Tests expected failed" << std::endl;
+        pout << "Tests expected failed" << std::endl;
        if ( level == 0 || ( level == 1 && N_expected_fail_tot > 50 ) ) {
            // We want to print a summary
            if ( size > 8 ) {
                // Print 1 summary for all processors
-                std::cout << "     " << N_expected_fail_tot << " tests expected failed (use report level 2 for more detail)" << std::endl;
+                printp( "     %i tests expected failed (use report level 2 for more detail)\n",
+                    N_expected_fail_tot );
            } else {
                // Print a summary for each processor
                for ( int i = 0; i < size; i++ )
-                    std::cout << "     " << N_expected_fail[i] << " tests expected failed (proc " << i << ") (use report level 1 or 2 for more detail)" << std::endl;
+                    printp( "     %i tests expected failed (proc %i) (use report level 2 for more "
+                            "detail)\n",
+                        N_expected_fail[i], i );
            }
        } else {
            // We want to print all messages
-            for (int i=0; i<size; i++) {
+            for ( int i = 0; i < size; i++ )
                ASSERT( (int) expected_fail_rank[i].size() == N_expected_fail[i] );
-                if ( N_expected_fail[i] > 0 ) {
-                    std::cout << "     Processor " << i << ":" << std::endl;
-                    for (unsigned int j=0; j<expected_fail_rank[i].size(); j++)
-                        std::cout << "        " <<  expected_fail_rank[i][j] << std::endl;
+            print_messages( expected_fail_rank );
        }
-            }
-        }
-        std::cout << std::endl;
+        pout << std::endl;
    }
    // Add a barrier to synchronize all processors (rank 0 is much slower)
-    #ifdef USE_MPI
-        if ( getSize() > 1 )
-            MPI_Barrier(comm);
-    #endif
+    barrier();
+    Utilities::sleep_ms( 10 ); // Need a brief pause to allow any printing to finish
+    mutex.unlock();
 }


+/********************************************************************
+ *  Gather the messages to rank 0                                    *
+ ********************************************************************/
+std::vector<std::vector<std::string>> UnitTest::gatherMessages(
+    const std::vector<std::string> &local_messages, int tag ) const
+{
+    // The result holds one (possibly empty) list per rank;
+    // the lists are only filled on rank 0
+    const int my_rank = getRank();
+    const int nranks  = getSize();
+    std::vector<std::vector<std::string>> gathered( nranks );
+    if ( my_rank != 0 ) {
+        // Every other rank packs its local messages and sends them to rank 0
+        pack_message_stream( local_messages, 0, tag );
+        return gathered;
+    }
+    // Rank 0 keeps its own messages ...
+    gathered[0] = local_messages;
+    // ... and unpacks the stream of each remaining rank in increasing rank order
+    for ( int src = 1; src < nranks; src++ )
+        gathered[src] = unpack_message_stream( src, tag );
+    return gathered;
+}
+

 /********************************************************************
 *  Pack and send the given messages                                 *
 ********************************************************************/
-void UnitTest::pack_message_stream(const std::vector<std::string>& messages, const int rank, const int tag)
+void UnitTest::pack_message_stream(
+    const std::vector<std::string> &messages, const int rank, const int tag ) const
 {
 #ifdef USE_MPI
    // Get the size of the messages
-        int N_messages = (int) messages.size();
-        int *msg_size = new int[N_messages];
+    auto N_messages  = (int) messages.size();
+    auto *msg_size   = new int[N_messages];
    int msg_size_tot = 0;
    for ( int i = 0; i < N_messages; i++ ) {
        msg_size[i] = (int) messages[i].size();
        msg_size_tot += msg_size[i];
    }
    // Allocate space for the message stream
-        int size_data = (N_messages+1)*sizeof(int)+msg_size_tot;
-        char *data = new char[size_data];
+    size_t size_data = ( N_messages + 1 ) * sizeof( int ) + msg_size_tot;
+    auto *data       = new char[size_data];
    // Pack the message stream
-        int *tmp = (int*) data;
-        tmp[0] = N_messages;
-        for (int i=0; i<N_messages; i++)
-            tmp[i+1] = msg_size[i];
-        int k = (N_messages+1)*sizeof(int);
+    memcpy( data, &N_messages, sizeof( int ) );
+    memcpy( &data[sizeof( int )], msg_size, N_messages * sizeof( int ) );
+    size_t k = ( N_messages + 1 ) * sizeof( int );
    for ( int i = 0; i < N_messages; i++ ) {
        messages[i].copy( &data[k], msg_size[i] );
        k += msg_size[i];
@@ -235,14 +261,18 @@ void UnitTest::pack_message_stream(const std::vector<std::string>& messages, con
    MPI_Wait( &request, &status );
    delete[] data;
    delete[] msg_size;
+#else
+    NULL_USE( messages );
+    NULL_USE( rank );
+    NULL_USE( tag );
 #endif
 }


 /********************************************************************
-*  receive and unpack a message stream                              *
+ *  Receive and unpack a message stream                              *
 ********************************************************************/
-std::vector<std::string> UnitTest::unpack_message_stream(const int rank, const int tag)
+std::vector<std::string> UnitTest::unpack_message_stream( const int rank, const int tag ) const
 {
 #ifdef USE_MPI
    // Probe the message to get the message size
@@ -252,26 +282,32 @@ std::vector<std::string> UnitTest::unpack_message_stream(const int rank, const i
    MPI_Get_count( &status, MPI_BYTE, &size_data );
    ASSERT( size_data >= 0 );
    // Allocate memory to receive the data
-        char *data = new char[size_data];
+    auto *data = new char[size_data];
    // receive the data (using a non-blocking receive)
    MPI_Request request;
    MPI_Irecv( data, size_data, MPI_CHAR, rank, tag, comm, &request );
    // Wait for the communication to be received
    MPI_Wait( &request, &status );
    // Unpack the message stream
-        int *tmp = (int*) data;
-        int N_messages = tmp[0];
-        int *msg_size = &tmp[1];
+    int N_messages = 0;
+    memcpy( &N_messages, data, sizeof( int ) );
+    if ( N_messages == 0 ) {
+        delete[] data;
+        return std::vector<std::string>();
+    }
+    std::vector<int> msg_size( N_messages );
    std::vector<std::string> messages( N_messages );
+    memcpy( msg_size.data(), &data[sizeof( int )], N_messages * sizeof( int ) );
    int k = ( N_messages + 1 ) * sizeof( int );
    for ( int i = 0; i < N_messages; i++ ) {
        messages[i] = std::string( &data[k], msg_size[i] );
        k += msg_size[i];
    }
-        // Delete the temporary memory
    delete[] data;
    return messages;
 #else
+    NULL_USE( rank );
+    NULL_USE( tag );
    return std::vector<std::string>();
 #endif
 }
@@ -280,7 +316,7 @@ std::vector<std::string> UnitTest::unpack_message_stream(const int rank, const i
 /********************************************************************
 *  Other functions                                                  *
 ********************************************************************/
-int UnitTest::getRank()
+int UnitTest::getRank() const
 {
    int rank = 0;
 #ifdef USE_MPI
@@ -291,7 +327,7 @@ int UnitTest::getRank()
 #endif
    return rank;
 }
-int UnitTest::getSize()
+int UnitTest::getSize() const
 {
    int size = 1;
 #ifdef USE_MPI
@@ -302,12 +338,12 @@ int UnitTest::getSize()
 #endif
    return size;
 }
-size_t UnitTest::NumPassGlobal()
+size_t UnitTest::NumPassGlobal() const
 {
    size_t num = pass_messages.size();
 #ifdef USE_MPI
    if ( getSize() > 1 ) {
-            int send = static_cast<int>(num);
+        auto send = static_cast<int>( num );
        int sum   = 0;
        MPI_Allreduce( &send, &sum, 1, MPI_INT, MPI_SUM, comm );
        num = static_cast<size_t>( sum );
@@ -315,12 +351,12 @@ size_t UnitTest::NumPassGlobal()
 #endif
    return num;
 }
-size_t UnitTest::NumFailGlobal()
+size_t UnitTest::NumFailGlobal() const
 {
    size_t num = fail_messages.size();
 #ifdef USE_MPI
    if ( getSize() > 1 ) {
-            int send = static_cast<int>(num);
+        auto send = static_cast<int>( num );
        int sum   = 0;
        MPI_Allreduce( &send, &sum, 1, MPI_INT, MPI_SUM, comm );
        num = static_cast<size_t>( sum );
@@ -328,12 +364,12 @@ size_t UnitTest::NumFailGlobal()
 #endif
    return num;
 }
-size_t UnitTest::NumExpectedFailGlobal()
+size_t UnitTest::NumExpectedFailGlobal() const
 {
    size_t num = expected_fail_messages.size();
 #ifdef USE_MPI
    if ( getSize() > 1 ) {
-            int send = static_cast<int>(num);
+        auto send = static_cast<int>( num );
        int sum   = 0;
        MPI_Allreduce( &send, &sum, 1, MPI_INT, MPI_SUM, comm );
        num = static_cast<size_t>( sum );
@@ -341,5 +377,3 @@ size_t UnitTest::NumExpectedFailGlobal()
 #endif
    return num;
 }
-
-
--- a/common/UnitTest.h
+++ b/common/UnitTest.h
@@ -1,9 +1,10 @@
 #ifndef included_UnitTest
 #define included_UnitTest

+#include <mutex>
 #include <sstream>
-#include <vector>
 #include <string>
+#include <vector>
 #ifdef USE_MPI
 #include "mpi.h"
 #endif
@@ -27,78 +28,92 @@
 * \endcode

 */
-class UnitTest {
+class UnitTest
+{
 public:
-
    //! Constructor
    UnitTest();

-    //! Indicate a passed test
-    virtual void passes (const std::string &in) { pass_messages.push_back(in); }
+    //! Destructor
+    virtual ~UnitTest();

-    //! Indicate a failed test
-    virtual void failure (const std::string &in) { fail_messages.push_back(in); }
+    //! Indicate a passed test (thread-safe)
+    virtual void passes( const std::string &in );

-    //! Indicate an expected failed test
-    virtual void expected_failure (const std::string &in) { expected_fail_messages.push_back(in); }
+    //! Indicate a failed test (thread-safe)
+    virtual void failure( const std::string &in );
+
+    //! Indicate an expected failed test (thread-safe)
+    virtual void expected_failure( const std::string &in );

    //! Return the number of passed tests locally
-    virtual size_t NumPassLocal () { return pass_messages.size(); }
+    virtual size_t NumPassLocal() const { return pass_messages.size(); }

    //! Return the number of failed tests locally
-    virtual size_t NumFailLocal () { return fail_messages.size(); }
+    virtual size_t NumFailLocal() const { return fail_messages.size(); }

    //! Return the number of expected failed tests locally
-    virtual size_t NumExpectedFailLocal () { return expected_fail_messages.size(); }
+    virtual size_t NumExpectedFailLocal() const { return expected_fail_messages.size(); }

    //! Return the number of passed tests locally
-    virtual size_t NumPassGlobal ();
+    virtual size_t NumPassGlobal() const;

    //! Return the number of failed tests locally
-    virtual size_t NumFailGlobal ();
+    virtual size_t NumFailGlobal() const;

    //! Return the number of expected failed tests locally
-    virtual size_t NumExpectedFailGlobal ();
+    virtual size_t NumExpectedFailGlobal() const;

    //! Return the rank of the current processor
-    int getRank ();
+    int getRank() const;

    //! Return the number of processors
-    int getSize ();
+    int getSize() const;

    /*!
     * Print a report of the passed and failed tests.
     * Note: This is a blocking call that all processors must execute together.
-     * Note: Only rank 0 will print the messages (this is necessary as other ranks may not be able to print correctly).
+     * Note: Only rank 0 will print the messages (this is necessary as other ranks may not be able
+     * to print correctly).
     * @param level     Optional integer specifying the level of reporting (default: 1)
     *                  0: Report the number of tests passed, failed, and expected failures.
-     *                  1: Report the number of passed tests (if <=20) or the number passed otherwise,
-     *                     report all failures,
-     *                     report the number of expected failed tests (if <=50) or the number passed otherwise.
+     *                  1: Report the number of passed tests (if <=20) or the number passed
+     *                     otherwise, report all failures, report the number of expected
+     *                     failed tests (if <=50) or the number passed otherwise.
     *                  2: Report all passed, failed, and expected failed tests.
     */
-     virtual void report(const int level=1);
+    virtual void report( const int level = 1 ) const;
+
+    //! Clear the messages
+    void reset();

 protected:
    std::vector<std::string> pass_messages;
    std::vector<std::string> fail_messages;
    std::vector<std::string> expected_fail_messages;
+    mutable std::mutex mutex;
 #ifdef USE_MPI
    MPI_Comm comm;
 #endif

 private:
    // Make the copy constructor private
-    UnitTest(const UnitTest& p) {}
+    UnitTest( const UnitTest & ) {}

    // Function to pack the messages into a single data stream and send to the given processor
    // Note: This function does not return until the message stream has been sent
-    void pack_message_stream(const std::vector<std::string>& messages, const int rank, const int tag);
+    void pack_message_stream(
+        const std::vector<std::string> &messages, const int rank, const int tag ) const;

    // Function to unpack the messages from a single data stream
    // Note: This function does not return until the message stream has been received
-    std::vector<std::string> unpack_message_stream(const int rank, const int tag);
+    std::vector<std::string> unpack_message_stream( const int rank, const int tag ) const;

+    // Helper functions
+    inline void barrier() const;
+    inline std::vector<int> allGather( int value ) const;
+    inline std::vector<std::vector<std::string>> gatherMessages(
+        const std::vector<std::string> &local_messages, int tag ) const;
 };


--- a/common/Utilities.h
+++ b/common/Utilities.h
@@ -1,19 +1,19 @@
 #ifndef included_Utilities
 #define included_Utilities

+#include <chrono>
+#include <cstdarg>
+#include <iostream>
+#include <mutex>
 #include <stdio.h>
 #include <stdlib.h>
-#include <iostream>
+#include <sys/stat.h>
+#include <thread>
 #include <vector>


+namespace Utilities {

-/*!
- * Utilities is a Singleton class containing basic routines for error 
- * reporting, file manipulations, etc.  Included are a set of \ref Macros "macros" that are commonly used.
- */
-namespace Utilities
-{

 /*!
 * Aborts the run after printing an error message with file and
@@ -33,6 +33,7 @@ namespace Utilities
 //! Function to set the error handlers
 void setErrorHandlers();

+
 /*!
 * Function to get the memory availible.
 * This function will return the total memory availible
@@ -42,6 +43,7 @@ namespace Utilities
 */
 size_t getSystemMemory();

+
 /*!
 * Function to get the memory usage.
 * This function will return the total memory used by the application.
@@ -55,9 +57,29 @@ namespace Utilities
 //! Function to get an arbitrary point in time
 double time();

+
 //! Function to get the resolution of time
 double tick();

+
+//! std::string version of sprintf
+inline std::string stringf( const char *format, ... );
+
+
+/*!
+ * Sleep for X ms
+ * @param N         Time to sleep (ms)
+ */
+inline void sleep_ms( int N ) { std::this_thread::sleep_for( std::chrono::milliseconds( N ) ); }
+
+
+/*!
+ * Sleep for X s
+ * @param N         Time to sleep (s)
+ */
+inline void sleep_s( int N ) { std::this_thread::sleep_for( std::chrono::seconds( N ) ); }
+
+
 //! Factor a number into it's prime factors
 std::vector<int> factor(size_t number);

@@ -69,6 +91,17 @@ namespace Utilities

 #include "common/UtilityMacros.h"

+
+// stringf: printf-style formatting into a std::string (bounded; output truncated to 4095 chars)
+inline std::string Utilities::stringf( const char *format, ... )
+{
+    va_list ap;
+    va_start( ap, format );
+    char tmp[4096];
+    vsnprintf( tmp, sizeof( tmp ), format, ap ); // never writes past tmp (vsprintf could)
+    va_end( ap );
+    return std::string( tmp );
+}
+
+
 #endif
-
-
--- a/common/UtilityMacros.h
+++ b/common/UtilityMacros.h
@@ -22,11 +22,17 @@
 *  \details  A statement that does nothing, for insure++ make it something
 * more complex than a simple C null statement to avoid a warning.
 */
+#ifndef NULL_STATEMENT
 #ifdef __INSURE__
-    #define NULL_STATEMENT do{if(0) int nullstatement=0 }}while(0)
+#define NULL_STATEMENT             \
+    do {                           \
+        if ( 0 )                   \
+            int nullstatement = 0; \
+    } while ( 0 )
 #else
 #define NULL_STATEMENT
 #endif
+#endif


 /*! \def NULL_USE(variable)
@@ -34,9 +40,15 @@
 *  \details  A null use of a variable, use to avoid GNU compiler warnings about unused variables.
 *  \param variable  Variable to pretend to use
 */
-#define NULL_USE(variable) do {                         \
-    if(0) {char *temp = (char *)&variable; temp++;}     \
+#ifndef NULL_USE
+#define NULL_USE( variable )                \
+    do {                                    \
+        if ( 0 ) {                          \
+            auto temp = (char *) &variable; \
+            temp++;                         \
+        }                                   \
    } while ( 0 )
+#endif


 /*! \def ERROR(MSG)
@@ -46,7 +58,8 @@
 *     line number of the abort are also printed.
 *  \param MSG  Error message to print
 */
-#define ERROR(MSG) do {                                 \
+#define ERROR(MSG)                                                \
+    do {                                                          \
        ::Utilities::abort( MSG, __FILE__, __LINE__ );            \
    } while ( 0 )

@@ -56,10 +69,12 @@
 *  \details Print a warning without exit.  Print file and line number of the warning.
 *  \param MSG  Warning message to print
 */
-#define WARNING(MSG) do {                                           \
+#define WARNING(MSG)                                                    \
+    do {                                                                \
        std::stringstream tboxos;                                       \
        tboxos << MSG << std::ends;                                     \
-    printf("WARNING: %s\n   Warning called in %s on line %i\n",tboxos.str().c_str(),__FILE__,__LINE__); \
+        printf("WARNING: %s\n   Warning called in %s on line %i\n",     \
+            tboxos.str().c_str(),__FILE__,__LINE__);                    \
    }while(0)


@@ -71,7 +86,8 @@
 *     The file and line number of the abort are printed along with the stack trace (if availible).
 *  \param EXP  Expression to evaluate
 */
-#define ASSERT(EXP) do {                                            \
+#define ASSERT(EXP)                                                     \
+    do {                                                                \
        if ( !(EXP) ) {                                                 \
            std::stringstream tboxos;                                   \
            tboxos << "Failed assertion: " << #EXP << std::ends;        \
@@ -99,7 +115,6 @@
 }while(0)


-
 /**
 * Macro for use when assertions are to be included
 * only when debugging.
@@ -118,6 +133,49 @@
 #endif


+/*! \def DISABLE_WARNINGS
+ *  \brief Suppress all warnings
+ *  \details This will start to suppress all compile warnings.
+ *      Be sure to follow with ENABLE_WARNINGS
+ */
+/*! \def ENABLE_WARNINGS
+ *  \brief Re-enable warnings
+ *  \details This will re-enable warnings after a call to DISABLE_WARNINGS
+ */
+// clang-format off
+#ifdef DISABLE_WARNINGS
+    // Macros previously defined
+#elif defined( USING_MSVC )
+    #define DISABLE_WARNINGS __pragma( warning( push, 0 ) )
+    #define ENABLE_WARNINGS __pragma( warning( pop ) )
+#elif defined( USING_CLANG )
+    #define DISABLE_WARNINGS                                                \
+        _Pragma( "clang diagnostic push" ) _Pragma( "clang diagnostic ignored \"-Wall\"" ) \
+        _Pragma( "clang diagnostic ignored \"-Wextra\"" )                   \
+        _Pragma( "clang diagnostic ignored \"-Wunused-private-field\"" )    \
+        _Pragma( "clang diagnostic ignored \"-Wmismatched-new-delete\"" )
+    #define ENABLE_WARNINGS _Pragma( "clang diagnostic pop" )
+#elif defined( USING_GCC )
+    // Note: We cannot disable the -Wliteral-suffix message with this macro because the
+    // pragma command cannot suppress warnings from the C++ preprocessor.  See gcc bug #53431.
+    #define DISABLE_WARNINGS                                                \
+        _Pragma( "GCC diagnostic push" ) _Pragma( "GCC diagnostic ignored \"-Wall\"" ) \
+        _Pragma( "GCC diagnostic ignored \"-Wextra\"" )                     \
+        _Pragma( "GCC diagnostic ignored \"-Wpragmas\"" )                     \
+        _Pragma( "GCC diagnostic ignored \"-Wunused-local-typedefs\"" )     \
+        _Pragma( "GCC diagnostic ignored \"-Woverloaded-virtual\"" )        \
+        _Pragma( "GCC diagnostic ignored \"-Wunused-parameter\"" )          \
+        _Pragma( "GCC diagnostic ignored \"-Warray-bounds\"" )              \
+        _Pragma( "GCC diagnostic ignored \"-Wterminate\"" )
+    #define ENABLE_WARNINGS _Pragma( "GCC diagnostic pop" )
+#else
+    #define DISABLE_WARNINGS
+    #define ENABLE_WARNINGS
+#endif
+// clang-format on
+
+
+
 /*! @} */


--- a/tests/lbpm_color_simulator.h
+++ b/tests/lbpm_color_simulator.h
@@ -9,9 +9,24 @@
 #define ANALYSIS_INTERVAL 1000
 #define BLOBID_INTERVAL 1000

-enum AnalysisType{ AnalyzeNone=0, IdentifyBlobs=0x01, CopyPhaseIndicator=0x02, 
+
+enum class AnalysisType : uint64_t { AnalyzeNone=0, IdentifyBlobs=0x01, CopyPhaseIndicator=0x02, 
     CopySimState=0x04, ComputeAverages=0x08, CreateRestart=0x10, WriteVis=0x20 };

+//! Set the flags of rhs in lhs (bitwise or of the underlying values); inline since this is a header
+inline AnalysisType& operator |=(AnalysisType &lhs, AnalysisType rhs)
+{
+    typedef std::underlying_type<AnalysisType>::type int_type;
+    lhs = static_cast<AnalysisType>( static_cast<int_type>(lhs) | static_cast<int_type>(rhs) );
+    return lhs;
+}
+//! Return true if x and y share at least one flag (parentheses required: != binds tighter than &)
+inline bool matches( AnalysisType x, AnalysisType y )
+{
+    typedef std::underlying_type<AnalysisType>::type int_type;
+    return ( static_cast<int_type>(x) & static_cast<int_type>(y) ) != 0;
+}
+

 template<class TYPE>
 void DeleteArray( const TYPE *p )
@@ -30,7 +45,7 @@ struct AnalysisWaitIdStruct {


 // Helper class to write the restart file from a seperate thread
-class WriteRestartWorkItem: public ThreadPool::WorkItem
+class WriteRestartWorkItem: public ThreadPool::WorkItemRet<void>
 {
 public:
    WriteRestartWorkItem( const char* filename_, std::shared_ptr<double> cDen_,
@@ -41,7 +56,6 @@ public:
        WriteCheckpoint(filename,cDen.get(),cfq.get(),N);
        PROFILE_STOP("Save Checkpoint",1);
    };
-    virtual bool has_result() const { return false; }
 private:
    WriteRestartWorkItem();
    const char* filename;
@@ -54,7 +68,7 @@ private:
 static const std::string id_map_filename = "lbpm_id_map.txt";
 typedef std::shared_ptr<std::pair<int,IntArray> > BlobIDstruct;
 typedef std::shared_ptr<std::vector<BlobIDType> > BlobIDList;
-class BlobIdentificationWorkItem1: public ThreadPool::WorkItem
+class BlobIdentificationWorkItem1: public ThreadPool::WorkItemRet<void>
 {
 public:
    BlobIdentificationWorkItem1( int timestep_, int Nx_, int Ny_, int Nz_, const RankInfoStruct& rank_info_, 
@@ -75,7 +89,6 @@ public:
        new_index->first = ComputeGlobalBlobIDs(Nx-2,Ny-2,Nz-2,rank_info,*phase,dist,vF,vS,ids,newcomm);
        PROFILE_STOP("Identify blobs",1);
    }
-    virtual bool has_result() const { return false; }
 private:
    BlobIdentificationWorkItem1();
    int timestep;
@@ -87,7 +100,7 @@ private:
    BlobIDList new_list;
    MPI_Comm newcomm;
 };
-class BlobIdentificationWorkItem2: public ThreadPool::WorkItem
+class BlobIdentificationWorkItem2: public ThreadPool::WorkItemRet<void>
 {
 public:
    BlobIdentificationWorkItem2( int timestep_, int Nx_, int Ny_, int Nz_, const RankInfoStruct& rank_info_, 
@@ -122,7 +135,6 @@ public:
        }
        PROFILE_STOP("Identify blobs maps",1);
    }
-    virtual bool has_result() const { return false; }
 private:
    BlobIdentificationWorkItem2();
    int timestep;
@@ -137,7 +149,7 @@ private:


 // Helper class to write the vis file from a thread
-class WriteVisWorkItem: public ThreadPool::WorkItem
+class WriteVisWorkItem: public ThreadPool::WorkItemRet<void>
 {
 public:
    WriteVisWorkItem( int timestep_, std::vector<IO::MeshDataStruct>& visData_,
@@ -164,7 +176,6 @@ public:
        IO::writeData( timestep, visData, newcomm );
        PROFILE_STOP("Save Vis",1);
    };
-    virtual bool has_result() const { return false; }
 private:
    WriteVisWorkItem();
    int timestep;
@@ -177,7 +188,7 @@ private:

 // Helper class to run the analysis from within a thread
 // Note: Averages will be modified after the constructor is called
-class AnalysisWorkItem: public ThreadPool::WorkItem
+class AnalysisWorkItem: public ThreadPool::WorkItemRet<void>
 {
 public:
    AnalysisWorkItem( AnalysisType type_, int timestep_, TwoPhase& Averages_, 
@@ -191,10 +202,10 @@ public:
        Averages.Label_NWP_map = *id_list;
        Averages.NumberComponents_WP = 1;
        Averages.Label_WP.fill(0.0);
-        if ( (type&CopyPhaseIndicator) != 0 ) {
+        if ( matches(type,AnalysisType::CopyPhaseIndicator) ) {
            // Averages.ColorToSignedDistance(beta,Averages.Phase,Averages.Phase_tplus);
        }
-        if ( (type&ComputeAverages) != 0 ) {
+        if ( matches(type,AnalysisType::ComputeAverages) ) {
            PROFILE_START("Compute dist",1);
            Averages.Initialize();
            Averages.ComputeDelPhi();
@@ -212,7 +223,6 @@ public:
            PROFILE_STOP("Compute dist",1);
        }
    }
-    virtual bool has_result() const { return false; }
 private:
    AnalysisWorkItem();
    AnalysisType type;
@@ -223,6 +233,7 @@ private:
    double beta;
 };

+
 // Function to start the analysis
 void run_analysis( int timestep, int restart_interval, 
    const RankInfoStruct& rank_info, ScaLBL_Communicator &ScaLBL_Comm, TwoPhase& Averages,
@@ -236,46 +247,45 @@ void run_analysis( int timestep, int restart_interval,
    int N = Nx*Ny*Nz;

    // Determin the analysis we want to perform
-    AnalysisType type = AnalyzeNone;
+    AnalysisType type = AnalysisType::AnalyzeNone;
    if ( timestep%ANALYSIS_INTERVAL + 5 == ANALYSIS_INTERVAL ) {
        // Copy the phase indicator field for the earlier timestep
-        type = static_cast<AnalysisType>( type | CopyPhaseIndicator );
+        type |= AnalysisType::CopyPhaseIndicator;
    }
    if ( timestep%BLOBID_INTERVAL == 0 ) {
        // Identify blobs and update global ids in time
-        type = static_cast<AnalysisType>( type | IdentifyBlobs );
+        type |= AnalysisType::IdentifyBlobs;
    }
    /*#ifdef USE_CUDA
        if ( tpool.getQueueSize()<=3 && tpool.getNumThreads()>0 && timestep%50==0 ) {
            // Keep a few blob identifications queued up to keep the processors busy,
            // allowing us to track the blobs as fast as possible
            // Add more detailed estimates of the update frequency required to track blobs
-            type = static_cast<AnalysisType>( type | IdentifyBlobs );
+            type |= AnalysisType::IdentifyBlobs;
        }
-    #endif
-    */
+    #endif */
    if ( timestep%ANALYSIS_INTERVAL == 0 ) {
        // Copy the averages to the CPU (and identify blobs)
-        type = static_cast<AnalysisType>( type | CopySimState );
-        type = static_cast<AnalysisType>( type | IdentifyBlobs );
+        type |= AnalysisType::CopySimState;
+        type |= AnalysisType::IdentifyBlobs;
    }
    if ( timestep%ANALYSIS_INTERVAL == 5 ) {
        // Run the analysis
-        type = static_cast<AnalysisType>( type | ComputeAverages );
+        type |= AnalysisType::ComputeAverages;
    }
    if (timestep%restart_interval == 0) {
        // Write the restart file
-        type = static_cast<AnalysisType>( type | CreateRestart );
+        type |= AnalysisType::CreateRestart;
    }
    if (timestep%restart_interval == 0) {
        // Write the visualization data
-        type = static_cast<AnalysisType>( type | WriteVis );
-        type = static_cast<AnalysisType>( type | CopySimState );
-        type = static_cast<AnalysisType>( type | IdentifyBlobs );
+        type |= AnalysisType::WriteVis;
+        type |= AnalysisType::CopySimState;
+        type |= AnalysisType::IdentifyBlobs;
    }
    
    // Return if we are not doing anything
-    if ( type == AnalyzeNone )
+    if ( type == AnalysisType::AnalyzeNone )
        return;

    PROFILE_START("start_analysis");
@@ -284,21 +294,23 @@ void run_analysis( int timestep, int restart_interval,
    ScaLBL_DeviceBarrier();
    PROFILE_START("Copy data to host",1);
    std::shared_ptr<DoubleArray> phase;
-    if ( (type&CopyPhaseIndicator)!=0 || (type&ComputeAverages)!=0 ||
-         (type&CopySimState)!=0 || (type&IdentifyBlobs)!=0 )
+    if ( matches(type,AnalysisType::CopyPhaseIndicator) ||
+         matches(type,AnalysisType::ComputeAverages) ||
+         matches(type,AnalysisType::CopySimState) || 
+         matches(type,AnalysisType::IdentifyBlobs) )
    {
        phase = std::shared_ptr<DoubleArray>(new DoubleArray(Nx,Ny,Nz));
        ScaLBL_CopyToHost(phase->data(),Phi,N*sizeof(double));
    }
-    if ( (type&CopyPhaseIndicator)!=0 ) {
+    if ( matches(type,AnalysisType::CopyPhaseIndicator) ) {
        memcpy(Averages.Phase_tplus.data(),phase->data(),N*sizeof(double));
        //Averages.ColorToSignedDistance(beta,Averages.Phase,Averages.Phase_tplus);
    }
-    if ( (type&ComputeAverages)!=0 ) {
+    if ( matches(type,AnalysisType::ComputeAverages) ) {
        memcpy(Averages.Phase_tminus.data(),phase->data(),N*sizeof(double));
        //Averages.ColorToSignedDistance(beta,Averages.Phase,Averages.Phase_tminus);
    }
-    if ( (type&CopySimState) != 0 ) {
+    if ( matches(type,AnalysisType::CopySimState) ) {
        // Copy the members of Averages to the cpu (phase was copied above)
        // Wait 
        PROFILE_START("Copy-Pressure",1);
@@ -319,7 +331,7 @@ void run_analysis( int timestep, int restart_interval,
        PROFILE_STOP("Copy-State",1);
    }
    std::shared_ptr<double> cDen, cfq;
-    if ( (type&CreateRestart) != 0 ) {
+    if ( matches(type,AnalysisType::CreateRestart) ) {
        // Copy restart data to the CPU
        cDen = std::shared_ptr<double>(new double[2*Np],DeleteArray<double>);
        cfq = std::shared_ptr<double>(new double[19*Np],DeleteArray<double>);
@@ -329,14 +341,14 @@ void run_analysis( int timestep, int restart_interval,
    PROFILE_STOP("Copy data to host",1);

    // Spawn threads to do blob identification work
-    if ( (type&IdentifyBlobs)!=0 ) {
+    if ( matches(type,AnalysisType::IdentifyBlobs) ) {
        BlobIDstruct new_index(new std::pair<int,IntArray>(0,IntArray()));
        BlobIDstruct new_ids(new std::pair<int,IntArray>(0,IntArray()));
        BlobIDList new_list(new std::vector<BlobIDType>());
-        ThreadPool::WorkItem *work1 = new BlobIdentificationWorkItem1(timestep,
-            Nx,Ny,Nz,rank_info,phase,Averages.SDs,last_ids,new_index,new_ids,new_list);
-        ThreadPool::WorkItem *work2 = new BlobIdentificationWorkItem2(timestep,
-            Nx,Ny,Nz,rank_info,phase,Averages.SDs,last_ids,new_index,new_ids,new_list);
+        auto work1 = new BlobIdentificationWorkItem1(timestep,Nx,Ny,Nz,rank_info,
+            phase,Averages.SDs,last_ids,new_index,new_ids,new_list);
+        auto work2 = new BlobIdentificationWorkItem2(timestep,Nx,Ny,Nz,rank_info,
+            phase,Averages.SDs,last_ids,new_index,new_ids,new_list);
        work1->add_dependency(wait.blobID);
        work2->add_dependency(tpool.add_work(work1));
        wait.blobID = tpool.add_work(work2);
@@ -346,9 +358,8 @@ void run_analysis( int timestep, int restart_interval,
    }

    // Spawn threads to do the analysis work
-    if ( (type&ComputeAverages) != 0 ) {
-        ThreadPool::WorkItem *work = new AnalysisWorkItem(
-            type,timestep,Averages,last_index,last_id_map,beta);
+    if ( matches(type,AnalysisType::ComputeAverages) ) {
+        auto work = new AnalysisWorkItem(type,timestep,Averages,last_index,last_id_map,beta);
        work->add_dependency(wait.blobID);
        work->add_dependency(wait.analysis);
        work->add_dependency(wait.vis);     // Make sure we are done using analysis before modifying
@@ -356,15 +367,15 @@ void run_analysis( int timestep, int restart_interval,
    }

    // Spawn a thread to write the restart file
-    if ( (type&CreateRestart) != 0 ) {
+    if ( matches(type,AnalysisType::CreateRestart) ) {
        int rank = MPI_WORLD_RANK();
-        //if (pBC) {
-            //err = fabs(sat_w - sat_w_previous);
-            //sat_w_previous = sat_w;
-	  //if (rank==0){
-	  // printf("Timestep %i: change in saturation since last checkpoint is %f \n",timestep,err);
-	  // }
-	  // }
+        /* if (pBC) {
+            err = fabs(sat_w - sat_w_previous);
+            sat_w_previous = sat_w;
+            if (rank==0){
+               printf("Timestep %i: change in saturation since last checkpoint is %f \n",timestep,err);
+           }
+        } */
        // Wait for previous restart files to finish writing (not necessary, but helps to ensure memory usage is limited)
        tpool.wait(wait.restart);
        // Retain the timestep associated with the restart files
@@ -374,17 +385,17 @@ void run_analysis( int timestep, int restart_interval,
            fclose(Rst);
        }
        // Write the restart file (using a seperate thread)
-        WriteRestartWorkItem *work = new WriteRestartWorkItem(LocalRestartFile,cDen,cfq,Np);
+        auto work = new WriteRestartWorkItem(LocalRestartFile,cDen,cfq,Np);
        work->add_dependency(wait.restart);
        wait.restart = tpool.add_work(work);
    }

    // Save the results for visualization
-    if ( (type&CreateRestart) != 0 ) {
+    if ( matches(type,AnalysisType::CreateRestart) ) {
        // Wait for previous restart files to finish writing (not necessary, but helps to ensure memory usage is limited)
        tpool.wait(wait.vis);
        // Write the vis files
-        ThreadPool::WorkItem *work = new WriteVisWorkItem( timestep, visData, Averages, fillData );
+        auto work = new WriteVisWorkItem( timestep, visData, Averages, fillData );
        work->add_dependency(wait.blobID);
        work->add_dependency(wait.analysis);
        work->add_dependency(wait.vis);
--- a/threadpool/atomic_helpers.cpp
+++ b/threadpool/atomic_helpers.cpp
@@ -27,4 +27,3 @@ int atomic_pthread_lock_initialized = create_atomic_pthread_lock();

 } // AtomicOperations namespace

-
--- a/threadpool/atomic_helpers.h
+++ b/threadpool/atomic_helpers.h
@@ -5,7 +5,6 @@
 #include <stdint.h>
 #include <stdio.h>
 #include <typeinfo>
-#include <stdexcept>

 // Choose the OS
 #if defined( WIN32 ) || defined( _WIN32 ) || defined( WIN64 ) || defined( _WIN64 )
@@ -89,6 +88,16 @@ inline int32_atomic atomic_get( const int32_atomic volatile *x );
 */
 inline int64_atomic atomic_get( const int64_atomic volatile *x );

+
+/**
+ * \brief Get the value
+ * \details Read the data in x
+ * \param[in] x     The pointer to the value to get
+ */
+template<class TYPE>
+inline TYPE *atomic_get( volatile TYPE **x );
+
+
 /**
 * \brief Set the value
 * \details Set the data in x to y (*x=y)
@@ -185,9 +194,8 @@ inline bool atomic_compare_and_swap( void *volatile *v, void *x, void *y );
 * \brief Fetch the current value and "and" with given value
 * \details Perform *v = (*v) & x, returning the previous value
 * \return Returns the previous value before the "and" operation
- * \param[in] v     The pointer to the value to check and swap
- * \param[in] x     The value to compare
- * \param[in] y     The value to swap iff *v==x
+ * \param[in] v     The pointer to the value to "and" with x
+ * \param[in] x     The value to and
 */
 inline int32_atomic atomic_fetch_and_and( int32_atomic volatile *v, int32_atomic x );

@@ -195,9 +203,8 @@ inline int32_atomic atomic_fetch_and_and( int32_atomic volatile *v, int32_atomic
 * \brief Fetch the current value and "and" with given value
 * \details Perform *v = (*v) & x, returning the previous value
 * \return Returns the previous value before the "and" operation
- * \param[in] v     The pointer to the value to check and swap
- * \param[in] x     The value to compare
- * \param[in] y     The value to swap iff *v==x
+ * \param[in] v     The pointer to the value to "and" with x
+ * \param[in] x     The value to and
 */
 inline int64_atomic atomic_fetch_and_and( int64_atomic volatile *v, int64_atomic x );

@@ -205,9 +212,8 @@ inline int64_atomic atomic_fetch_and_and( int64_atomic volatile *v, int64_atomic
 * \brief Fetch the current value and "or" with given value
 * \details Perform *v = (*v) | x, returning the previous value
 * \return Returns the previous value before the "and" operation
- * \param[in] v     The pointer to the value to check and swap
- * \param[in] x     The value to compare
- * \param[in] y     The value to swap iff *v==x
+ * \param[in] v     The pointer to the value to "or" with x
+ * \param[in] x     The value to or
 */
 inline int32_atomic atomic_fetch_and_or( int32_atomic volatile *v, int32_atomic x );

@@ -216,13 +222,12 @@ inline int32_atomic atomic_fetch_and_or( int32_atomic volatile *v, int32_atomic
 * \details Perform *v = (*v) | x, returning the previous value
 * \return Returns the previous value before the "and" operation
 * \param[in] v     The pointer to the value to check and swap
- * \param[in] x     The value to compare
- * \param[in] y     The value to swap iff *v==x
+ * \param[in] v     The pointer to the value to "or" with x
+ * \param[in] x     The value to or
 */
 inline int64_atomic atomic_fetch_and_or( int64_atomic volatile *v, int64_atomic x );


-
 /**
 * \brief Class to store a pool of objects
 * \details This class stores a pool of objects that can be added/removed in a thread-safe way
@@ -261,6 +266,7 @@ class pool
        while ( !atomic_compare_and_swap( (void *volatile *) &d_data[i], nullptr, ptr ) )
            i = ( i + 1 ) % N_MAX;
    }
+
 private:
    volatile TYPE **d_data;
    pool( const pool &rhs );
@@ -323,10 +329,24 @@ inline int64_atomic atomic_decrement( int64_atomic volatile *x )
 {
    return OSAtomicDecrement64Barrier( x );
 }
-int32_atomic atomic_fetch_and_or( int32_atomic volatile *v, int32_atomic x ) { return OSAtomicOr32Orig( x, (volatile uint32_t *) v ); }
-int32_atomic atomic_fetch_and_and( int32_atomic volatile *v, int32_atomic x ) { return OSAtomicAnd32Orig( x, (volatile uint32_t *) v); }
-int64_atomic atomic_fetch_and_or( int64_atomic volatile *v, int64_atomic x ) { throw std::logic_error("Not availible for this OS"); return 0; }
-int64_atomic atomic_fetch_and_and( int64_atomic volatile *v, int64_atomic x ) { throw std::logic_error("Not availible for this OS"); return 0; }
+int32_atomic atomic_fetch_and_or( int32_atomic volatile *v, int32_atomic x )
+{
+    return OSAtomicOr32Orig( x, (volatile uint32_t *) v );
+}
+int32_atomic atomic_fetch_and_and( int32_atomic volatile *v, int32_atomic x )
+{
+    return OSAtomicAnd32Orig( x, (volatile uint32_t *) v );
+}
+int64_atomic atomic_fetch_and_or( int64_atomic volatile *v, int64_atomic x )
+{
+    throw std::logic_error( "Not availible for this OS" );
+    return 0;
+}
+int64_atomic atomic_fetch_and_and( int64_atomic volatile *v, int64_atomic x )
+{
+    throw std::logic_error( "Not availible for this OS" );
+    return 0;
+}
 inline int32_atomic atomic_add( int32_atomic volatile *x, int32_atomic y )
 {
    return OSAtomicAdd32Barrier( y, x );
@@ -352,10 +372,22 @@ int32_atomic atomic_increment( int32_atomic volatile *x ) { return __sync_add_an
 int64_atomic atomic_increment( int64_atomic volatile *x ) { return __sync_add_and_fetch( x, 1 ); }
 int32_atomic atomic_decrement( int32_atomic volatile *x ) { return __sync_sub_and_fetch( x, 1 ); }
 int64_atomic atomic_decrement( int64_atomic volatile *x ) { return __sync_sub_and_fetch( x, 1 ); }
-int32_atomic atomic_fetch_and_or( int32_atomic volatile *v, int32_atomic x ) { return __sync_fetch_and_or( v, x ); }
-int64_atomic atomic_fetch_and_or( int64_atomic volatile *v, int64_atomic x ) { return __sync_fetch_and_or( v, x ); }
-int32_atomic atomic_fetch_and_and( int32_atomic volatile *v, int32_atomic x ) { return __sync_fetch_and_and( v, x ); }
-int64_atomic atomic_fetch_and_and( int64_atomic volatile *v, int64_atomic x ) { return __sync_fetch_and_and( v, x ); }
+int32_atomic atomic_fetch_and_or( int32_atomic volatile *v, int32_atomic x )
+{
+    return __sync_fetch_and_or( v, x );
+}
+int64_atomic atomic_fetch_and_or( int64_atomic volatile *v, int64_atomic x )
+{
+    return __sync_fetch_and_or( v, x );
+}
+int32_atomic atomic_fetch_and_and( int32_atomic volatile *v, int32_atomic x )
+{
+    return __sync_fetch_and_and( v, x );
+}
+int64_atomic atomic_fetch_and_and( int64_atomic volatile *v, int64_atomic x )
+{
+    return __sync_fetch_and_and( v, x );
+}
 inline int32_atomic atomic_add( int32_atomic volatile *x, int32_atomic y )
 {
    return __sync_add_and_fetch( x, y );
@@ -459,31 +491,44 @@ inline int64_atomic atomic_get( const int64_atomic volatile *x )
 {
    return atomic_add( const_cast<int64_atomic volatile *>( x ), 0 );
 }
+template<class TYPE>
+inline TYPE *atomic_get( volatile TYPE **x )
+{ // Reads the pointer through the int64_atomic overload of atomic_add (adding 0)
+    static_assert( sizeof( TYPE * ) == sizeof( int64_atomic ), "pointer must match int64_atomic" );
+    return reinterpret_cast<TYPE *>( atomic_add( reinterpret_cast<int64_atomic volatile *>( x ), 0 ) );
+}
 inline void atomic_set( int32_atomic volatile *x, int32_atomic y )
 {
    int32_atomic tmp = *x;
-    while ( !atomic_compare_and_swap( x, tmp, y ) ) { tmp = *x; }
+    while ( !atomic_compare_and_swap( x, tmp, y ) ) {
+        tmp = *x;
+    }
 }
 inline void atomic_set( int64_atomic volatile *x, int64_atomic y )
 {
    int64_atomic tmp = *x;
-    while ( !atomic_compare_and_swap( x, tmp, y ) ) { tmp = *x; }
+    while ( !atomic_compare_and_swap( x, tmp, y ) ) {
+        tmp = *x;
+    }
 }
 inline void atomic_swap( int32_atomic volatile *x, int32_atomic *y )
 {
    int32_atomic tmp = *x;
-    while ( !atomic_compare_and_swap( x, tmp, *y ) ) { tmp = *x; }
+    while ( !atomic_compare_and_swap( x, tmp, *y ) ) {
+        tmp = *x;
+    }
    *y = tmp;
 }
 inline void atomic_swap( int64_atomic volatile *x, int64_atomic *y )
 {
    int64_atomic tmp = *x;
-    while ( !atomic_compare_and_swap( x, tmp, *y ) ) { tmp = *x; }
+    while ( !atomic_compare_and_swap( x, tmp, *y ) ) {
+        tmp = *x;
+    }
    *y = tmp;
 }


-
 // Define an atomic counter
 struct counter_t {
 public:
@@ -499,6 +544,7 @@ public:
    inline void setCount( int val ) { count = val; }
    // Get the current value of the count
    inline int getCount() const { return count; }
+
 private:
    counter_t( const counter_t & );
    counter_t &operator=( const counter_t & );
--- a/threadpool/atomic_list.h
+++ b/threadpool/atomic_list.h
@@ -1,14 +1,13 @@
 #ifndef included_AtomicModelAtomicList
 #define included_AtomicModelAtomicList

-#include <functional>
-#include <csignal>
 #include <atomic>
+#include <csignal>
+#include <functional>

 #include "threadpool/atomic_helpers.h"


-
 /** \class AtomicList
 *
 * \brief Maintain a sorted list of entries
@@ -25,12 +24,14 @@ public:
    /*!
     * \brief   Remove an item from the list
     * \details Find and remove first entry that meets the given criteria
-     * @return          Return the item that matches the criteria, or the default item if no item matches
-     * @param comp	 	Comparison function object (i.e. an object that satisfies
+     * @return          Return the item that matches the criteria,
+     *                  or the default item if no item matches
+     * @param compare   Comparison function object (i.e. an object that satisfies
     *                  the requirements of Compare) which returns true if the
     *                  given value meets the selection criteria.
     *                  The signature of the comparison function should be equivalent to:
     *                      bool cmp( const TYPE& value, ... );
+     * @param args      Additional arguments for the comparison
     */
    template<class Compare, class... Args>
    inline TYPE remove( Compare compare, Args... args );
@@ -42,11 +43,6 @@ public:
     * \brief   Insert an item
     * \details Insert an item into the list
     * @param x         Item to insert
-     * @param comp	 	Comparison function object (i.e. an object that satisfies
-     *                  the requirements of Compare) which returns true if the
-     *                  first argument is less than (i.e. is ordered before) the second. 
-     *                  The signature of the comparison function should be equivalent to:
-     *                      bool cmp(const TYPE &a, const TYPE &b);
     */
    inline void insert( TYPE x );

@@ -177,8 +173,6 @@ private:
 };


-
-
 #include "threadpool/atomic_list.hpp"

 #endif
--- a/threadpool/atomic_list.hpp
+++ b/threadpool/atomic_list.hpp
@@ -2,19 +2,17 @@
 #define included_AtomicList_hpp


-#include <stdexcept>
 #include <iostream>
+#include <stdexcept>
 #include <thread>


-
 /******************************************************************
 * Constructor                                                     *
 ******************************************************************/
 template<class TYPE, int MAX_SIZE, class COMPARE>
-AtomicList<TYPE,MAX_SIZE,COMPARE>::AtomicList( const TYPE& default_value, const COMPARE& comp ):
-    d_compare(comp),
-    d_default(default_value)
+AtomicList<TYPE, MAX_SIZE, COMPARE>::AtomicList( const TYPE &default_value, const COMPARE &comp )
+    : d_compare( comp ), d_default( default_value )
 {
    d_N        = 0;
    d_next[0]  = -1;
@@ -52,7 +50,8 @@ inline TYPE AtomicList<TYPE,MAX_SIZE,COMPARE>::remove( Compare compare, Args...
        // Test to see if the object passes compare
        bool test = compare( const_cast<TYPE &>( d_objects[next - 1] ), args... );
        if ( test ) {
-            // We want to return this object, update next to point to another entry and remove the entry
+            // We want to return this object, update next to point to another entry and remove the
+            // entry
            unlock( next, -3 );
            unlock( pos, next2 );
            pos = next;
@@ -187,7 +186,6 @@ inline bool AtomicList<TYPE,MAX_SIZE,COMPARE>::check( )
 }


-
 /******************************************************************
 * MemoryPool                                                      *
 ******************************************************************/
--- a/threadpool/test/test_atomic.cpp
+++ b/threadpool/test/test_atomic.cpp
@@ -1,15 +1,15 @@
 #include "threadpool/atomic_helpers.h"
 #include "common/UnitTest.h"
 #include "common/Utilities.h"
-#include <iostream>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string>
-#include <vector>
-#include <thread>
-#include <chrono>
-#include <functional>
 #include <atomic>
+#include <chrono>
+#include <cstdio>
+#include <cstdlib>
+#include <functional>
+#include <iostream>
+#include <string>
+#include <thread>
+#include <vector>


 #define perr std::cerr
@@ -108,7 +108,8 @@ int main( int, char *[] )
    for ( int i = 0; i < N_threads; i++ )
        threads[i].join();
    stop = std::chrono::high_resolution_clock::now();
-    double time_inc_parallel = std::chrono::duration<double>(stop-start).count() / ( N_count * N_threads );
+    double time_inc_parallel =
+        std::chrono::duration<double>( stop - start ).count() / ( N_count * N_threads );
    val = count.getCount();
    if ( val != N_count * N_threads ) {
        char tmp[100];
@@ -124,7 +125,8 @@ int main( int, char *[] )
    for ( int i = 0; i < N_threads; i++ )
        threads[i].join();
    stop = std::chrono::high_resolution_clock::now();
-    double time_dec_parallel = std::chrono::duration<double>(stop-start).count() / ( N_count * N_threads );
+    double time_dec_parallel =
+        std::chrono::duration<double>( stop - start ).count() / ( N_count * N_threads );
    val = count.getCount();
    if ( val != 0 ) {
        char tmp[100];
@@ -147,6 +149,6 @@ int main( int, char *[] )

    // Finished
    ut.report();
-    int N_errors = static_cast<int>( ut.NumFailGlobal() );
+    auto N_errors = static_cast<int>( ut.NumFailGlobal() );
    return N_errors;
 }
--- a/threadpool/test/test_atomic_list.cpp
+++ b/threadpool/test/test_atomic_list.cpp
@@ -1,16 +1,16 @@
 #include "threadpool/atomic_list.h"
 #include "common/UnitTest.h"
 #include "common/Utilities.h"
-#include <iostream>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string>
-#include <vector>
-#include <thread>
-#include <chrono>
-#include <functional>
-#include <atomic>
 #include <algorithm>
+#include <atomic>
+#include <chrono>
+#include <cstdio>
+#include <cstdlib>
+#include <functional>
+#include <iostream>
+#include <string>
+#include <thread>
+#include <vector>



@@ -23,11 +23,21 @@ static void modify_list( AtomicList<int,1024>& list )
        auto v3 = list.remove( []( int v ) { return v >= ( rand() / 8 ); } );
        auto v4 = list.remove( []( int v ) { return v >= ( rand() / 4 ); } );
        auto v5 = list.remove( []( int v ) { return v >= ( rand() / 2 ); } );
-        if ( v1 !=-1 ) { list.insert( v1 ); }
-        if ( v2 !=-1 ) { list.insert( v2 ); }
-        if ( v3 !=-1 ) { list.insert( v3 ); }
-        if ( v4 !=-1 ) { list.insert( v4 ); }
-        if ( v5 !=-1 ) { list.insert( v5 ); }
+        if ( v1 != -1 ) {
+            list.insert( v1 );
+        }
+        if ( v2 != -1 ) {
+            list.insert( v2 );
+        }
+        if ( v3 != -1 ) {
+            list.insert( v3 );
+        }
+        if ( v4 != -1 ) {
+            list.insert( v4 );
+        }
+        if ( v5 != -1 ) {
+            list.insert( v5 );
+        }
    }
 }

@@ -37,14 +47,14 @@ static bool check_list( const std::vector<int>& x, AtomicList<int,1024>& list )
    bool pass = list.check();
    pass      = pass && (int) x.size() == list.size();
    if ( pass ) {
-        for (size_t i=0; i<x.size(); i++)
-            pass = pass && x[i] == list.remove( [](int) { return true; } );
+        for ( int i : x )
+            pass = pass && i == list.remove( []( int ) { return true; } );
    }
    // Restore the list
    for ( int i = 0; i < list.size(); i++ )
        list.remove_first();
-    for (size_t i=0; i<x.size(); i++)
-        list.insert( x[i] );
+    for ( int i : x )
+        list.insert( i );
    return pass;
 }

@@ -56,7 +66,6 @@ static inline void clear_list(AtomicList<int,1024>& list )
 }


-
 /******************************************************************
 * The main program                                                *
 ******************************************************************/
@@ -90,13 +99,14 @@ int main( int, char *[] )
        ut.failure( "Basic sanity test" );

    // Clear the list
-    while ( list.remove( [](int) { return true; } ) != -1 ) {}
+    while ( list.remove( []( int ) { return true; } ) != -1 ) {
+    }

    // Create a list of known values
    // std::vector<int> data0(512);
    std::vector<int> data0( 5 * N_threads );
-    for (size_t i=0; i<data0.size(); i++)
-        data0[i] = rand();
+    for ( int &i : data0 )
+        i = rand();
    auto data = data0;
    std::sort( data.begin(), data.end() );

@@ -110,8 +120,8 @@ int main( int, char *[] )
    for ( int it = 0; it < N_it; it++ ) {
        clear_list( list );
        start = std::chrono::high_resolution_clock::now();
-        for (size_t i=0; i<data0.size(); i++)
-            list.insert( data0[i] );
+        for ( int i : data0 )
+            list.insert( i );
        stop = std::chrono::high_resolution_clock::now();
        time += ( stop - start );
    }
@@ -139,21 +149,22 @@ int main( int, char *[] )
        stop = std::chrono::high_resolution_clock::now();
        time += ( stop - start );
    }
-    printf("remove (ordered) time/item = %0.0f ns\n",1e9*time.count()/(N_it*data0.size()));
+    printf(
+        "remove (ordered) time/item = %0.0f ns\n", 1e9 * time.count() / ( N_it * data0.size() ) );

    // Test the cost to remove (out order)
    time = time.zero();
    for ( int it = 0; it < N_it; it++ ) {
        check_list( data, list );
        start = std::chrono::high_resolution_clock::now();
-        for (size_t i=0; i<data0.size(); i++) {
-            int tmp = data0[i];
+        for ( int tmp : data0 ) {
            list.remove( [tmp]( int v ) { return v == tmp; } );
        }
        stop = std::chrono::high_resolution_clock::now();
        time += ( stop - start );
    }
-    printf("remove (unordered) time/item = %0.0f ns\n",1e9*time.count()/(N_it*data0.size()));
+    printf(
+        "remove (unordered) time/item = %0.0f ns\n", 1e9 * time.count() / ( N_it * data0.size() ) );

    // Read/write to the list and check the results
    int64_t N0 = list.N_remove();
@@ -205,6 +216,6 @@ int main( int, char *[] )

    // Finished
    ut.report();
-    int N_errors = static_cast<int>( ut.NumFailGlobal() );
+    auto N_errors = static_cast<int>( ut.NumFailGlobal() );
    return N_errors;
 }
--- a/threadpool/test/test_thread_pool.cpp
+++ b/threadpool/test/test_thread_pool.cpp
@@ -5,15 +5,15 @@
 #include "threadpool/thread_pool.h"
 #include "common/UnitTest.h"
 #include "common/Utilities.h"
-#include <math.h>
 #include <algorithm>
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
 #include <iostream>
+#include <mutex>
 #include <stdexcept>
-#include <stdio.h>
-#include <stdlib.h>
 #include <string>
 #include <vector>
-#include <mutex>


 #define MAX( x, y ) ( ( x ) > ( y ) ? ( x ) : ( y ) )
@@ -82,7 +82,8 @@ void waste_cpu( int N )
 // Sleep for the given time
 // Note: since we may encounter interrupts, we may not sleep for the desired time
 //   so we need to perform the sleep in a loop
-void sleep_ms( int64_t N ) {
+void sleep_ms( int64_t N )
+{
    auto t1 = std::chrono::high_resolution_clock::now();
    auto t2 = std::chrono::high_resolution_clock::now();
    while ( to_ms( t2 - t1 ) < N ) {
@@ -91,9 +92,7 @@ void sleep_ms( int64_t N ) {
        t2 = std::chrono::high_resolution_clock::now();
    }
 }
-void sleep_s( int N ) {
-    sleep_ms(1000*N);
-}
+void sleep_s( int N ) { sleep_ms( static_cast<int64_t>( N ) * 1000 ); } // widen before scaling to ms


 // Function to sleep for N seconds then increment a global count
@@ -135,7 +134,7 @@ void print_processor( ThreadPool *tpool )
    sprintf( tmp, "%i:  Thread,proc = %i,%i\n", rank, thread, processor );
    sleep_ms( 10 * rank );
    print_processor_mutex.lock();
-    std::cout << tmp;
+    pout << tmp;
    print_processor_mutex.unlock();
    sleep_ms( 100 );
 }
@@ -161,7 +160,9 @@ int test_member_thread( ThreadPool *tpool )
 }


-// Functions to test the templates
+/******************************************************************
+ * Test the TPOOL_ADD_WORK macro with variable number of arguments *
+ ******************************************************************/
 static int myfun0() { return 0; }
 static int myfun1( int ) { return 1; }
 static int myfun2( int, float ) { return 2; }
@@ -170,60 +171,6 @@ static int myfun4( int, float, double, char ) { return 4; }
 static int myfun5( int, float, double, char, std::string ) { return 5; }
 static int myfun6( int, float, double, char, std::string, int ) { return 6; }
 static int myfun7( int, float, double, char, std::string, int, int ) { return 7; }
-
-
-// Function to test instantiation of functions with different number of arguments
-// clang-format off
-static void vfunarg00() {}
-static void vfunarg01( int ) {}
-static void vfunarg02( int, char ) {}
-static void vfunarg03( int, char, double ) {}
-static void vfunarg04( int, char, double, int ) {}
-static void vfunarg05( int, char, double, int, char ) {}
-static void vfunarg06( int, char, double, int, char, double ) {}
-static void vfunarg07( int, char, double, int, char, double, int ) {}
-static void vfunarg08( int, char, double, int, char, double, int, char ) {}
-static void vfunarg09( int, char, double, int, char, double, int, char, double ) {}
-static void vfunarg10( int, char, double, int, char, double, int, char, double, int ) {}
-static void vfunarg11( int, char, double, int, char, double, int, char, double, int, char ) {}
-static void vfunarg12( int, char, double, int, char, double, int, char, double, int, char, double ) {}
-static void vfunarg13( int, char, double, int, char, double, int, char, double, int, char, double, int ) {}
-static void vfunarg14( int, char, double, int, char, double, int, char, double, int, char, double, int, char ) {}
-static void vfunarg15( int, char, double, int, char, double, int, char, double, int, char, double, int, char, double ) {}
-static void vfunarg16( int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int ) {}
-static void vfunarg17( int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int, char ) {}
-static void vfunarg18( int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int, char, double ) {}
-static void vfunarg19( int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int ) {}
-static void vfunarg20( int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int, char ) {}
-static void vfunarg21( int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int, char, double ) {}
-static void vfunarg22( int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int ) {}
-static void vfunarg23( int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int, char ) {}
-static void vfunarg24( int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int, char, double ) {}
-static int funarg00() { return 0; }
-static int funarg01( int ) { return 1; }
-static int funarg02( int, char ) { return 2; }
-static int funarg03( int, char, double ) { return 3; }
-static int funarg04( int, char, double, int ) { return 4; }
-static int funarg05( int, char, double, int, char ) { return 5; }
-static int funarg06( int, char, double, int, char, double ) { return 6; }
-static int funarg07( int, char, double, int, char, double, int ) { return 7; }
-static int funarg08( int, char, double, int, char, double, int, char ) { return 8; }
-static int funarg09( int, char, double, int, char, double, int, char, double ) { return 9; }
-static int funarg10( int, char, double, int, char, double, int, char, double, int ) { return 10; }
-static int funarg11( int, char, double, int, char, double, int, char, double, int, char ) { return 11; }
-static int funarg12( int, char, double, int, char, double, int, char, double, int, char, double ) { return 12; }
-static int funarg13( int, char, double, int, char, double, int, char, double, int, char, double, int ) { return 13; }
-static int funarg14( int, char, double, int, char, double, int, char, double, int, char, double, int, char ) { return 14; }
-static int funarg15( int, char, double, int, char, double, int, char, double, int, char, double, int, char, double ) { return 15; }
-static int funarg16( int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int ) { return 16; }
-static int funarg17( int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int, char ) { return 17; }
-static int funarg18( int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int, char, double ) { return 18; }
-static int funarg19( int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int ) { return 19; }
-static int funarg20( int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int, char ) { return 20; }
-static int funarg21( int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int, char, double ) { return 21; }
-static int funarg22( int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int ) { return 22; }
-static int funarg23( int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int, char ) { return 23; }
-static int funarg24( int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int, char, double ) { return 24; }
 static int test_function_arguements( ThreadPool *tpool )
 {
    int N_errors = 0;
@@ -231,83 +178,51 @@ static int test_function_arguements( ThreadPool *tpool )
    ThreadPool::thread_id_t id0 = TPOOL_ADD_WORK( tpool, myfun0, ( nullptr ) );
    ThreadPool::thread_id_t id1 = TPOOL_ADD_WORK( tpool, myfun1, ( (int) 1 ) );
    ThreadPool::thread_id_t id2 = TPOOL_ADD_WORK( tpool, myfun2, ( (int) 1, (float) 2 ) );
-    ThreadPool::thread_id_t id3 = TPOOL_ADD_WORK( tpool, myfun3, ( (int) 1, (float) 2, (double) 3 ) );
-    ThreadPool::thread_id_t id4 = TPOOL_ADD_WORK( tpool, myfun4, ( (int) 1, (float) 2, (double) 3, (char) 4 ) );
-    ThreadPool::thread_id_t id5 = TPOOL_ADD_WORK( tpool, myfun5, ( (int) 1, (float) 2, (double) 3, (char) 4, std::string( "test" ) ) );
-    ThreadPool::thread_id_t id52= TPOOL_ADD_WORK( tpool, myfun5, ( (int) 1, (float) 2, (double) 3, (char) 4, std::string( "test" ) ), -1 );
-    ThreadPool::thread_id_t id6 = TPOOL_ADD_WORK( tpool, myfun6, ( (int) 1, (float) 2, (double) 3, (char) 4, std::string( "test" ), (int) 1 ) );
-    ThreadPool::thread_id_t id7 = TPOOL_ADD_WORK( tpool, myfun7, ( (int) 1, (float) 2, (double) 3, (char) 4, std::string( "test" ), (int) 1, (int) 1 ) );
+    ThreadPool::thread_id_t id3 =
+        TPOOL_ADD_WORK( tpool, myfun3, ( (int) 1, (float) 2, (double) 3 ) );
+    ThreadPool::thread_id_t id4 =
+        TPOOL_ADD_WORK( tpool, myfun4, ( (int) 1, (float) 2, (double) 3, (char) 4 ) );
+    ThreadPool::thread_id_t id5 = TPOOL_ADD_WORK(
+        tpool, myfun5, ( (int) 1, (float) 2, (double) 3, (char) 4, std::string( "test" ) ) );
+    ThreadPool::thread_id_t id52 = TPOOL_ADD_WORK(
+        tpool, myfun5, ( (int) 1, (float) 2, (double) 3, (char) 4, std::string( "test" ) ), -1 );
+    ThreadPool::thread_id_t id6 = TPOOL_ADD_WORK( tpool, myfun6,
+        ( (int) 1, (float) 2, (double) 3, (char) 4, std::string( "test" ), (int) 1 ) );
+    ThreadPool::thread_id_t id7 = TPOOL_ADD_WORK( tpool, myfun7,
+        ( (int) 1, (float) 2, (double) 3, (char) 4, std::string( "test" ), (int) 1, (int) 1 ) );
    tpool->wait_pool_finished();
-    if ( !tpool->isFinished( id0 ) ) { N_errors++; }
-    if ( tpool->getFunctionRet<int>( id0 ) != 0 ) { N_errors++; }
-    if ( tpool->getFunctionRet<int>( id1 ) != 1 ) { N_errors++; }
-    if ( tpool->getFunctionRet<int>( id2 ) != 2 ) { N_errors++; }
-    if ( tpool->getFunctionRet<int>( id3 ) != 3 ) { N_errors++; }
-    if ( tpool->getFunctionRet<int>( id4 ) != 4 ) { N_errors++; }
-    if ( tpool->getFunctionRet<int>( id5 ) != 5 ) { N_errors++; }
-    if ( tpool->getFunctionRet<int>( id52 ) != 5 ){ N_errors++; }
-    if ( tpool->getFunctionRet<int>( id6 ) != 6 ) { N_errors++; }
-    if ( tpool->getFunctionRet<int>( id7 ) != 7 ) { N_errors++; }
-    // Test all the different numbers of arguments allowed
-    TPOOL_ADD_WORK( tpool, vfunarg00, ( nullptr ) );
-    TPOOL_ADD_WORK( tpool, vfunarg01, ( 1 ) );
-    TPOOL_ADD_WORK( tpool, vfunarg02, ( 1, 'a' ) );
-    TPOOL_ADD_WORK( tpool, vfunarg03, ( 1, 'a', 3.0 ) );
-    TPOOL_ADD_WORK( tpool, vfunarg04, ( 1, 'a', 3.0, 4 ) );
-    TPOOL_ADD_WORK( tpool, vfunarg05, ( 1, 'a', 3.0, 4, 'e' ) );
-    TPOOL_ADD_WORK( tpool, vfunarg06, ( 1, 'a', 3.0, 4, 'e', 6.0 ) );
-    TPOOL_ADD_WORK( tpool, vfunarg07, ( 1, 'a', 3.0, 4, 'e', 6.0, 7 ) );
-    TPOOL_ADD_WORK( tpool, vfunarg08, ( 1, 'a', 3.0, 4, 'e', 6.0, 7, 'h' ) );
-    TPOOL_ADD_WORK( tpool, vfunarg09, ( 1, 'a', 3.0, 4, 'e', 6.0, 7, 'h', 9.0 ) );
-    TPOOL_ADD_WORK( tpool, vfunarg10, ( 1, 'a', 3.0, 4, 'e', 6.0, 7, 'h', 9.0, 10 ) );
-    TPOOL_ADD_WORK( tpool, vfunarg11, ( 1, 'a', 3.0, 4, 'e', 6.0, 7, 'h', 9.0, 10, 'k' ) );
-    TPOOL_ADD_WORK( tpool, vfunarg12, ( 1, 'a', 3.0, 4, 'e', 6.0, 7, 'h', 9.0, 10, 'k', 12.0 ) );
-    TPOOL_ADD_WORK( tpool, vfunarg13, ( 1, 'a', 3.0, 4, 'e', 6.0, 7, 'h', 9.0, 10, 'k', 12.0, 13 ) );
-    TPOOL_ADD_WORK( tpool, vfunarg14, ( 1, 'a', 3.0, 4, 'e', 6.0, 7, 'h', 9.0, 10, 'k', 12.0, 13, 'n' ) );
-    TPOOL_ADD_WORK( tpool, vfunarg15, ( 1, 'a', 3.0, 4, 'e', 6.0, 7, 'h', 9.0, 10, 'k', 12.0, 13, 'n', 15.0 ) );
-    TPOOL_ADD_WORK( tpool, vfunarg16, ( 1, 'a', 3.0, 4, 'e', 6.0, 7, 'h', 9.0, 10, 'k', 12.0, 13, 'n', 15.0, 16 ) );
-    TPOOL_ADD_WORK( tpool, vfunarg17, ( 1, 'a', 3.0, 4, 'e', 6.0, 7, 'h', 9.0, 10, 'k', 12.0, 13, 'n', 15.0, 16, 'q' ) );
-    TPOOL_ADD_WORK( tpool, vfunarg18, ( 1, 'a', 3.0, 4, 'e', 6.0, 7, 'h', 9.0, 10, 'k', 12.0, 13, 'n', 15.0, 16, 'q', 18.0 ) );
-    TPOOL_ADD_WORK( tpool, vfunarg19, ( 1, 'a', 3.0, 4, 'e', 6.0, 7, 'h', 9.0, 10, 'k', 12.0, 13, 'n', 15.0, 16, 'q', 18.0, 19 ) );
-    TPOOL_ADD_WORK( tpool, vfunarg20, ( 1, 'a', 3.0, 4, 'e', 6.0, 7, 'h', 9.0, 10, 'k', 12.0, 13, 'n', 15.0, 16, 'q', 18.0, 19, 't' ) );
-    TPOOL_ADD_WORK( tpool, vfunarg21, ( 1, 'a', 3.0, 4, 'e', 6.0, 7, 'h', 9.0, 10, 'k', 12.0, 13, 'n', 15.0, 16, 'q', 18.0, 19, 't', 21.0 ) );
-    TPOOL_ADD_WORK( tpool, vfunarg22, ( 1, 'a', 3.0, 4, 'e', 6.0, 7, 'h', 9.0, 10, 'k', 12.0, 13, 'n', 15.0, 16, 'q', 18.0, 19, 't', 21.0, 22 ) );
-    TPOOL_ADD_WORK( tpool, vfunarg23, ( 1, 'a', 3.0, 4, 'e', 6.0, 7, 'h', 9.0, 10, 'k', 12.0, 13, 'n', 15.0, 16, 'q', 18.0, 19, 't', 21.0, 22, 'w' ) );
-    TPOOL_ADD_WORK( tpool, vfunarg24, ( 1, 'a', 3.0, 4, 'e', 6.0, 7, 'h', 9.0, 10, 'k', 12.0, 13, 'n', 15.0, 16, 'q', 18.0, 19, 't', 21.0, 22, 'w', 24.0 ) );
-    std::vector<ThreadPool::thread_id_t> ids( 25 );
-    ids[0]  = TPOOL_ADD_WORK( tpool, funarg00, ( nullptr ) );
-    ids[1]  = TPOOL_ADD_WORK( tpool, funarg01, ( 1 ) );
-    ids[2]  = TPOOL_ADD_WORK( tpool, funarg02, ( 1, 'a' ) );
-    ids[3]  = TPOOL_ADD_WORK( tpool, funarg03, ( 1, 'a', 3.0 ) );
-    ids[4]  = TPOOL_ADD_WORK( tpool, funarg04, ( 1, 'a', 3.0, 4 ) );
-    ids[5]  = TPOOL_ADD_WORK( tpool, funarg05, ( 1, 'a', 3.0, 4, 'e' ) );
-    ids[6]  = TPOOL_ADD_WORK( tpool, funarg06, ( 1, 'a', 3.0, 4, 'e', 6.0 ) );
-    ids[7]  = TPOOL_ADD_WORK( tpool, funarg07, ( 1, 'a', 3.0, 4, 'e', 6.0, 7 ) );
-    ids[8]  = TPOOL_ADD_WORK( tpool, funarg08, ( 1, 'a', 3.0, 4, 'e', 6.0, 7, 'h' ) );
-    ids[9]  = TPOOL_ADD_WORK( tpool, funarg09, ( 1, 'a', 3.0, 4, 'e', 6.0, 7, 'h', 9.0 ) );
-    ids[10] = TPOOL_ADD_WORK( tpool, funarg10, ( 1, 'a', 3.0, 4, 'e', 6.0, 7, 'h', 9.0, 10 ) );
-    ids[11] = TPOOL_ADD_WORK( tpool, funarg11, ( 1, 'a', 3.0, 4, 'e', 6.0, 7, 'h', 9.0, 10, 'k' ) );
-    ids[12] = TPOOL_ADD_WORK( tpool, funarg12, ( 1, 'a', 3.0, 4, 'e', 6.0, 7, 'h', 9.0, 10, 'k', 12.0 ) );
-    ids[13] = TPOOL_ADD_WORK( tpool, funarg13, ( 1, 'a', 3.0, 4, 'e', 6.0, 7, 'h', 9.0, 10, 'k', 12.0, 13 ) );
-    ids[14] = TPOOL_ADD_WORK( tpool, funarg14, ( 1, 'a', 3.0, 4, 'e', 6.0, 7, 'h', 9.0, 10, 'k', 12.0, 13, 'h' ) );
-    ids[15] = TPOOL_ADD_WORK( tpool, funarg15, ( 1, 'a', 3.0, 4, 'e', 6.0, 7, 'h', 9.0, 10, 'k', 12.0, 13, 'h', 15.0 ) );
-    ids[16] = TPOOL_ADD_WORK( tpool, funarg16, ( 1, 'a', 3.0, 4, 'e', 6.0, 7, 'h', 9.0, 10, 'k', 12.0, 13, 'n', 15.0, 16 ) );
-    ids[17] = TPOOL_ADD_WORK( tpool, funarg17, ( 1, 'a', 3.0, 4, 'e', 6.0, 7, 'h', 9.0, 10, 'k', 12.0, 13, 'n', 15.0, 16, 'q' ) );
-    ids[18] = TPOOL_ADD_WORK( tpool, funarg18, ( 1, 'a', 3.0, 4, 'e', 6.0, 7, 'h', 9.0, 10, 'k', 12.0, 13, 'n', 15.0, 16, 'q', 18.0 ) );
-    ids[19] = TPOOL_ADD_WORK( tpool, funarg19, ( 1, 'a', 3.0, 4, 'e', 6.0, 7, 'h', 9.0, 10, 'k', 12.0, 13, 'n', 15.0, 16, 'q', 18.0, 19 ) );
-    ids[20] = TPOOL_ADD_WORK( tpool, funarg20, ( 1, 'a', 3.0, 4, 'e', 6.0, 7, 'h', 9.0, 10, 'k', 12.0, 13, 'n', 15.0, 16, 'q', 18.0, 19, 't' ) );
-    ids[21] = TPOOL_ADD_WORK( tpool, funarg21, ( 1, 'a', 3.0, 4, 'e', 6.0, 7, 'h', 9.0, 10, 'k', 12.0, 13, 'n', 15.0, 16, 'q', 18.0, 19, 't', 21.0 ) );
-    ids[22] = TPOOL_ADD_WORK( tpool, funarg22, ( 1, 'a', 3.0, 4, 'e', 6.0, 7, 'h', 9.0, 10, 'k', 12.0, 13, 'n', 15.0, 16, 'q', 18.0, 19, 't', 21.0, 22 ) );
-    ids[23] = TPOOL_ADD_WORK( tpool, funarg23, ( 1, 'a', 3.0, 4, 'e', 6.0, 7, 'h', 9.0, 10, 'k', 12.0, 13, 'n', 15.0, 16, 'q', 18.0, 19, 't', 21.0, 22, 'w' ) );
-    ids[24] = TPOOL_ADD_WORK( tpool, funarg24, ( 1, 'a', 3.0, 4, 'e', 6.0, 7, 'h', 9.0, 10, 'k', 12.0, 13, 'n', 15.0, 16, 'q', 18.0, 19, 't', 21.0, 22, 'w', 24.0 ) );
-    tpool->wait_all( ids );
-    for ( size_t i = 0; i < ids.size(); i++ ) {
-        if ( tpool->getFunctionRet<int>( ids[i] ) != static_cast<int>( i ) )
+    if ( !tpool->isFinished( id0 ) ) {
+        N_errors++;
+    }
+    if ( tpool->getFunctionRet<int>( id0 ) != 0 ) {
+        N_errors++;
+    }
+    if ( tpool->getFunctionRet<int>( id1 ) != 1 ) {
+        N_errors++;
+    }
+    if ( tpool->getFunctionRet<int>( id2 ) != 2 ) {
+        N_errors++;
+    }
+    if ( tpool->getFunctionRet<int>( id3 ) != 3 ) {
+        N_errors++;
+    }
+    if ( tpool->getFunctionRet<int>( id4 ) != 4 ) {
+        N_errors++;
+    }
+    if ( tpool->getFunctionRet<int>( id5 ) != 5 ) {
+        N_errors++;
+    }
+    if ( tpool->getFunctionRet<int>( id52 ) != 5 ) {
+        N_errors++;
+    }
+    if ( tpool->getFunctionRet<int>( id6 ) != 6 ) {
+        N_errors++;
+    }
+    if ( tpool->getFunctionRet<int>( id7 ) != 7 ) {
        N_errors++;
    }
    return N_errors;
 }
-// clang-format on


 /******************************************************************
@@ -323,15 +238,15 @@ public:
        NULL_USE( dummy );
    }
    // User defined run (can do anything)
-    virtual void run() override
+    void run() override
    {
        // Perform the tasks
        printf( "Hello work from UserWorkItem (void)" );
    }
    // Will the routine return a result
-    virtual bool has_result() const override { return false; }
+    bool has_result() const override { return false; }
    // User defined destructor
-    virtual ~UserWorkItemVoid() {}
+    ~UserWorkItemVoid() override = default;
 };
 class UserWorkItemInt : public ThreadPool::WorkItemRet<int>
 {
@@ -343,36 +258,29 @@ public:
        NULL_USE( dummy );
    }
    // User defined run (can do anything)
-    virtual void run() override
+    void run() override
    {
        // Perform the tasks
        printf( "Hello work from UserWorkItem (int)" );
        // Store the results (it's type will match the template)
        ThreadPool::WorkItemRet<int>::d_result = 1;
    }
-    // Will the routine return a result
-    virtual bool has_result() const override { return false; }
    // User defined destructor
-    virtual ~UserWorkItemInt() {}
+    ~UserWorkItemInt() override = default;
 };


 /******************************************************************
 * test the time to run N tasks in parallel                        *
 ******************************************************************/
-inline double run_parallel( ThreadPool *tpool, int N_tasks, int N_work )
+template<class Ret, class... Args>
+inline double launchAndTime( ThreadPool &tpool, int N, Ret ( *routine )( Args... ), Args... args )
 {
-    // Make sure the thread pool is empty
-    tpool->wait_pool_finished();
-    // Add the work
-    std::vector<ThreadPool::thread_id_t> ids;
-    ids.reserve( N_tasks );
+    tpool.wait_pool_finished();
    auto start = std::chrono::high_resolution_clock::now();
-    for ( int i = 0; i < N_tasks; i++ )
-        ids.push_back( TPOOL_ADD_WORK( tpool, waste_cpu, ( N_work ) ) );
-    // Wait for the thread pool to finish
-    tpool->wait_pool_finished();
-    // Compute the time spent running the tasks
+    for ( int i = 0; i < N; i++ )
+        ThreadPool_add_work( &tpool, 0, routine, args... );
+    tpool.wait_pool_finished();
    auto stop = std::chrono::high_resolution_clock::now();
    return std::chrono::duration<double>( stop - start ).count();
 }
@@ -417,17 +325,19 @@ void test_FIFO( UnitTest& ut, ThreadPool& tpool )
 {
    int rank    = getRank();
    int size    = getSize();
+    const int N = 4000;
    for ( int r = 0; r < size; r++ ) {
        barrier();
        if ( r != rank )
            continue;
        std::vector<ThreadPool::thread_id_t> ids;
-        for (size_t i=0; i<4000; i++)
-            ids.push_back( TPOOL_ADD_WORK( &tpool, sleep_inc2, ( 0.001 ) ) );
+        ids.reserve( N );
+        for ( size_t i = 0; i < N; i++ )
+            ids.emplace_back( TPOOL_ADD_WORK( &tpool, sleep_inc2, ( 0.001 ) ) );
        bool pass = true;
        while ( tpool.N_queued() > 0 ) {
            int i1 = -1, i2 = ids.size();
-            for (size_t i=0; i<ids.size(); i++) {
+            for ( int i = N - 1; i >= 0; i-- ) {
                bool started = ids[i].started();
                if ( started )
                    i1 = std::max<int>( i1, i ); // Last index to processing item
@@ -510,11 +420,7 @@ int main( int argc, char *argv[] )

    // Get the number of processors availible
    barrier();
-    int N_procs = 0;
-    try {
-        N_procs = ThreadPool::getNumberOfProcessors();
-    } catch ( ... ) {
-    }
+    int N_procs = ThreadPool::getNumberOfProcessors();
    if ( N_procs > 0 )
        ut.passes( "getNumberOfProcessors" );
    else
@@ -524,15 +430,11 @@ int main( int argc, char *argv[] )

    // Get the processor affinities for the process
    barrier();
-    std::vector<int> cpus;
-    try {
-        cpus = ThreadPool::getProcessAffinity();
+    std::vector<int> cpus = ThreadPool::getProcessAffinity();
    printp( "%i cpus for current process: ", (int) cpus.size() );
-        for ( size_t i = 0; i < cpus.size(); i++ )
-            printp( "%i ", cpus[i] );
+    for ( int cpu : cpus )
+        printp( "%i ", cpu );
    printp( "\n" );
-    } catch ( ... ) {
-    }
    if ( !cpus.empty() ) {
        ut.passes( "getProcessAffinity" );
    } else {
@@ -559,8 +461,8 @@ int main( int argc, char *argv[] )
            cpus                  = ThreadPool::getProcessAffinity();
            std::vector<int> cpus = ThreadPool::getProcessAffinity();
            printp( "%i cpus for current process (updated): ", (int) cpus.size() );
-            for ( size_t i = 0; i < cpus.size(); i++ )
-                printp( "%i ", cpus[i] );
+            for ( int cpu : cpus )
+                printp( "%i ", cpu );
            printp( "\n" );
            pass = cpus.size() > 1;
        } else {
@@ -630,8 +532,8 @@ int main( int argc, char *argv[] )
            std::vector<int> procs_thread = tpool.getThreadAffinity( i );
            if ( procs_thread != procs ) {
                printp( "%i: Initial thread affinity: ", rank );
-                for ( size_t i = 0; i < procs_thread.size(); i++ )
-                    printp( "%i ", procs_thread[i] );
+                for ( int i : procs_thread )
+                    printp( "%i ", i );
                printp( "\n" );
                pass = false;
            }
@@ -653,8 +555,8 @@ int main( int argc, char *argv[] )
                std::vector<int> procs_thread2 = tpool.getThreadAffinity( i );
                if ( procs_thread2 != procs_thread ) {
                    printp( "%i: Final thread affinity: ", rank );
-                    for ( size_t i = 0; i < procs_thread.size(); i++ )
-                        printp( "%i ", procs_thread[i] );
+                    for ( int i : procs_thread )
+                        printp( "%i ", i );
                    printp( "\n" );
                    pass = false;
                }
@@ -674,8 +576,8 @@ int main( int argc, char *argv[] )
    for ( int i = 0; i < N_threads; i++ ) {
        std::vector<int> procs_thread = tpool.getThreadAffinity( i );
        printp( "Thread affinity: " );
-        for ( size_t i = 0; i < procs_thread.size(); i++ )
-            printp( "%i ", procs_thread[i] );
+        for ( int i : procs_thread )
+            printp( "%i ", i );
        printp( "\n" );
    }

@@ -683,9 +585,7 @@ int main( int argc, char *argv[] )
    barrier();
    ThreadPool::set_OS_warnings( 1 );
    print_processor( &tpool );
-    for ( int i = 0; i < N_threads; i++ )
-        TPOOL_ADD_WORK( &tpool, print_processor, ( &tpool ) );
-    tpool.wait_pool_finished();
+    launchAndTime( tpool, N_threads, print_processor, &tpool );

    // Run some basic tests
    barrier();
@@ -730,18 +630,12 @@ int main( int argc, char *argv[] )
    sleep_inc( 1 );
    stop                  = std::chrono::high_resolution_clock::now();
    double sleep_serial   = std::chrono::duration<double>( stop - start ).count();
-    ids2.clear();
-    start = std::chrono::high_resolution_clock::now();
-    for ( int i = 0; i < N_threads; i++ )
-        ids2.push_back( TPOOL_ADD_WORK( &tpool, sleep_inc, ( 1 ) ) );
-    tpool.wait_all( N_procs_used, &ids2[0] );
-    stop = std::chrono::high_resolution_clock::now();
-    ids2.clear();
-    double sleep_parallel = std::chrono::duration<double>(stop-start).count();
+    double sleep_parallel = launchAndTime( tpool, N_threads, sleep_inc, 1 );
    double sleep_speedup  = N_procs_used * sleep_serial / sleep_parallel;
    printf( "%i:  Speedup on %i sleeping threads: %0.3f\n", rank, N_procs_used, sleep_speedup );
    printf( "%i:    ts = %0.3f, tp = %0.3f\n", rank, sleep_serial, sleep_parallel );
-    if ( fabs( sleep_serial - 1.0 ) < 0.05 && fabs( sleep_parallel - 1.0 ) < 0.25 && sleep_speedup>3 )
+    if ( fabs( sleep_serial - 1.0 ) < 0.05 && fabs( sleep_parallel - 1.0 ) < 0.25 &&
+         sleep_speedup > 3 )
        ut.passes( "Passed thread sleep" );
    else
        ut.failure( "Failed thread sleep" );
@@ -773,8 +667,8 @@ int main( int argc, char *argv[] )
        stop               = std::chrono::high_resolution_clock::now();
        double time_serial = std::chrono::duration<double>( stop - start ).count();
        // Run in parallel
-        double time_parallel2 = run_parallel( &tpool, N_procs_used, N / 1000 );
-        double time_parallel  = run_parallel( &tpool, N_procs_used, N );
+        double time_parallel  = launchAndTime( tpool, N_procs_used, waste_cpu, N );
+        double time_parallel2 = launchAndTime( tpool, N_procs_used, waste_cpu, N / 1000 );
        double speedup        = N_procs_used * time_serial / time_parallel;
        printf( "%i:  Speedup on %i procs: %0.3f\n", rank, N_procs_used, speedup );
        printf( "%i:    ts = %0.3f, tp = %0.3f, tp2 = %0.3f\n", rank, time_serial, time_parallel,
@@ -849,8 +743,8 @@ int main( int argc, char *argv[] )
        id = tpool.add_work( wait3, 50 );
        tpool.wait( id );
        bool pass = true;
-        for (size_t i=0; i<ids.size(); i++)
-            pass = pass && ids[i].finished();
+        for ( auto &id : ids )
+            pass = pass && id.finished();
        ids.clear();
        if ( pass )
            ut.passes( "Dependencies2" );
@@ -908,9 +802,9 @@ int main( int argc, char *argv[] )
        printp( "   time = %0.0f ms\n", 1e3 * time );
        printp( "   time / cycle = %0.0f us\n", 1e6 * time / N_it );
        printp( "   average time / item = %0.0f ns\n", 1e9 * time / ( N_it * N_work ) );
-        printp( "      create = %i ns\n", static_cast<int>( time_create / ( N_it * N_work ) ) );
-        printp( "      run    = %i ns\n", static_cast<int>( time_run    / ( N_it * N_work ) ) );
-        printp( "      delete = %i us\n", static_cast<int>( time_delete / ( N_it * N_work ) ) );
+        printp( "      create = %i ns\n", static_cast<int>( time_create / ( N_it * N_work ) ) );
+        printp( "      run    = %i ns\n", static_cast<int>( time_run / ( N_it * N_work ) ) );
+        printp( "      delete = %i us\n", static_cast<int>( time_delete / ( N_it * N_work ) ) );
    }

    // Test the timing adding a single item
@@ -950,8 +844,8 @@ int main( int argc, char *argv[] )
        printp( "   time = %0.0f ms\n", 1e3 * time );
        printp( "   time / cycle = %0.0f us\n", 1e6 * time / N_it );
        printp( "   average time / item = %0.0f ns\n", 1e9 * time / ( N_it * N_work ) );
-        printp( "      create and add = %i ns\n", static_cast<int>( time_add / ( N_it * N_work ) ) );
-        printp( "      wait = %i us\n", static_cast<int>( time_wait / ( N_it * N_work ) ) );
+        printp( "      create and add = %i ns\n", static_cast<int>( time_add / ( N_it * N_work ) ) );
+        printp( "      wait = %i us\n", static_cast<int>( time_wait / ( N_it * N_work ) ) );
    }

    // Test the timing pre-creating the work items and adding multiple at a time
@@ -995,9 +889,9 @@ int main( int argc, char *argv[] )
        printp( "   time = %0.0f ms\n", 1e3 * time );
        printp( "   time / cycle = %0.0f us\n", 1e6 * time / N_it );
        printp( "   average time / item = %0.0f ns\n", 1e9 * time / ( N_it * N_work ) );
-        printp( "      create = %i ns\n", static_cast<int>( time_create_work / ( N_it * N_work ) ) );
-        printp( "      add = %i ns\n",  static_cast<int>( time_add_work / ( N_it * N_work ) ) );
-        printp( "      wait = %i ns\n", static_cast<int>( time_wait_work / ( N_it * N_work ) ) );
+        printp( "      create = %i ns\n", static_cast<int>( time_create_work / ( N_it * N_work ) ) );
+        printp( "      add = %i ns\n", static_cast<int>( time_add_work / ( N_it * N_work ) ) );
+        printp( "      wait = %i ns\n", static_cast<int>( time_wait_work / ( N_it * N_work ) ) );
    }

    // Run a dependency test that tests a simple case that should keep the thread pool busy
@@ -1035,8 +929,8 @@ int main( int argc, char *argv[] )
    barrier();
    pass = true;
    try {
-        ThreadPool *tpool = new ThreadPool( MAX_NUM_THREADS - 1 );
-        if ( tpool->getNumThreads() != MAX_NUM_THREADS - 1 )
+        ThreadPool *tpool = new ThreadPool( ThreadPool::MAX_NUM_THREADS - 1 );
+        if ( tpool->getNumThreads() != ThreadPool::MAX_NUM_THREADS - 1 )
            pass = false;
        if ( !ThreadPool::is_valid( tpool ) )
            pass = false;
@@ -1056,14 +950,14 @@ int main( int argc, char *argv[] )
    // Print the test results
    barrier();
    ut.report();
-    int N_errors = static_cast<int>( ut.NumFailGlobal() );
+    auto N_errors = static_cast<int>( ut.NumFailGlobal() );

    // Shudown MPI
    pout << "Shutting down\n";
    barrier();
 #ifdef USE_TIMER
    if ( rank == 0 )
-        MemoryApp::print( std::cout );
+        MemoryApp::print( pout );
 #endif
 #ifdef USE_MPI
    MPI_Finalize();
--- a/threadpool/thread_pool.cpp
+++ b/threadpool/thread_pool.cpp
@@ -5,14 +5,14 @@
 #include "ProfilerApp.h"
 #include <algorithm>
 #include <bitset>
+#include <chrono>
 #include <climits>
+#include <cstdio>
+#include <cstdlib>
 #include <iostream>
 #include <stdexcept>
-#include <stdio.h>
-#include <stdlib.h>
-#include <typeinfo>
 #include <thread>
-#include <chrono>
+#include <typeinfo>


 #define perr std::cerr
@@ -22,6 +22,15 @@

 // OS specific includes / definitions
 // clang-format off
+#if defined( WIN32 ) || defined( _WIN32 ) || defined( WIN64 ) || defined( _WIN64 )
+    #define USE_WINDOWS
+#elif defined( __APPLE__ )
+    #define USE_MAC
+#elif defined( __linux ) || defined( __unix ) || defined( __posix )
+    #define USE_LINUX
+#else
+    #error Unknown OS
+#endif
 #if defined( USE_WINDOWS )
    #include <process.h>
    #include <windows.h>
@@ -73,8 +82,9 @@
    } while ( 0 )
 #endif
 #if MONITOR_THREADPOOL_PERFORMANCE == 1
-    #define accumulate( x, t1, t2 ) AtomicOperations::atomic_add( &x, \
-        std::chrono::duration_cast<std::chrono::nanoseconds>(t2-t1).count() );
+#define accumulate( x, t1, t2 )   \
+    AtomicOperations::atomic_add( \
+        &x, std::chrono::duration_cast<std::chrono::nanoseconds>( t2 - t1 ).count() );
 #endif


@@ -87,7 +97,10 @@
 template<class T>
 void quicksort( int N, T *data );
 template<class T>
-inline void quicksort( std::vector<T> &x ) { quicksort((int)x.size(),x.data()); }
+inline void quicksort( std::vector<T> &x )
+{
+    quicksort( (int) x.size(), x.data() );
+}
 static inline int find_id( int, const ThreadPool::thread_id_t *, const ThreadPool::thread_id_t & );


@@ -130,6 +143,10 @@ static size_t rand_size_t()
 // We store the indicies to the queue list as short ints
 #error MAX_QUEUED must < 65535
 #endif
+// Check the c++ std
+#if CXX_STD == 98
+#error Thread pool class requires c++11 or newer
+#endif


 /******************************************************************
@@ -163,7 +180,8 @@ static inline bool get_bit( const volatile AtomicOperations::int64_atomic *x, si
 {
    uint64_t mask = 0x01;
    mask <<= index % 64;
-    AtomicOperations::int64_atomic y = x[index / 64];   // This is thread-safe since we only care about a single bit
+    // This is thread-safe since we only care about a single bit
+    AtomicOperations::int64_atomic y = x[index / 64]; 
    return ( y & mask ) != 0;
 }

@@ -194,7 +212,17 @@ static inline int count_bits( int_type x )


 /******************************************************************
-* Set the bahvior of OS warnings                                  *
+ * Define the static constexpr members (needed before C++17)       *
+ ******************************************************************/
+constexpr int ThreadPool::MAX_NUM_THREADS;
+constexpr int ThreadPool::MAX_QUEUED;
+constexpr int ThreadPool::MAX_WAIT;
+constexpr bool ThreadPool::PROFILE_THREADPOOL_PERFORMANCE;
+constexpr bool ThreadPool::MONITOR_THREADPOOL_PERFORMANCE;
+
+
+/******************************************************************
+ * Set the behavior of OS warnings                                 *
 ******************************************************************/
 static int global_OS_behavior = 0;
 std::mutex OS_warning_mutex;
@@ -213,7 +241,10 @@ static void OS_warning( const std::string &message )
    }
    OS_warning_mutex.unlock();
 }
-
+void ThreadPool::setErrorHandler( std::function<void( const std::string & )> fun )
+{
+    d_errorHandler = fun;
+}

 /******************************************************************
 * Function to return the number of processors availible           *
@@ -517,7 +548,7 @@ void ThreadPool::check_startup( size_t size0 )
    ThreadPool::thread_id_t id;
    if ( id.getPriority() != -128 )
        pass = false;
-    id.reset( 3, 564, NULL );
+    id.reset( 3, 564, nullptr );
    if ( id.getPriority() != 3 || id.getLocalID() != 564 )
        pass = false;
    if ( count_bits( 0x0 ) != 0 || count_bits( 0x03 ) != 2 )
@@ -530,8 +561,10 @@ void ThreadPool::check_startup( size_t size0 )
        if ( is_odd8( ~( (size_t) 0 ) ) || !is_odd8( thread_id_t::maxThreadID ) )
            pass = false;
        for ( size_t i = 0; i < 1024; i++ ) {
-            if ( ( count_bits( thread_id_t::maxThreadID - i ) % 2 == 1 ) != is_odd8( thread_id_t::maxThreadID - i ) ) {
-                printp( "%i %i %s\n", count_bits( thread_id_t::maxThreadID - i ), is_odd8( thread_id_t::maxThreadID - i ) ? 1 : 0,
+            if ( ( count_bits( thread_id_t::maxThreadID - i ) % 2 == 1 ) !=
+                 is_odd8( thread_id_t::maxThreadID - i ) ) {
+                printp( "%i %i %s\n", count_bits( thread_id_t::maxThreadID - i ),
+                    is_odd8( thread_id_t::maxThreadID - i ) ? 1 : 0,
                    std::bitset<64>( thread_id_t::maxThreadID - i ).to_string().c_str() );
                pass = false;
            }
@@ -566,11 +599,12 @@ void ThreadPool::initialize( const int N, const char *affinity, int N_procs, con
    d_N_added       = 0;
    d_N_started     = 0;
    d_N_finished    = 0;
+    d_max_wait_time = 600;
    memset( (void *) d_active, 0, MAX_NUM_THREADS / 8 );
    memset( (void *) d_cancel, 0, MAX_NUM_THREADS / 8 );
    d_wait_last = nullptr;
-    for ( int i     = 0; i < MAX_WAIT; i++ )
-        d_wait[i]   = nullptr;
+    for ( auto &i : d_wait )
+        i = nullptr;
    // Initialize the id
    d_id_assign = thread_id_t::maxThreadID;
    // Create the threads
@@ -583,10 +617,10 @@ void ThreadPool::initialize( const int N, const char *affinity, int N_procs, con
 ******************************************************************/
 ThreadPool::~ThreadPool()
 {
-    if ( !is_valid( this ) ) {
-        std::cerr << "Thread pool is not valid\n";
-        std::terminate();
-    }
+    DISABLE_WARNINGS
+    if ( !is_valid( this ) )
+        throw std::logic_error( "Thread pool is not valid" );
+    ENABLE_WARNINGS
    // Destroy the threads
    setNumThreads( 0 );
    // Delete all remaining data
@@ -598,9 +632,8 @@ ThreadPool::~ThreadPool()
    // Print the performance metrics
    printp( "ThreadPool Performance:\n" );
    printp( "add_work:  %lu us,  %lu us,  %lu us,  %lu us,  %lu us\n",
-        total_add_work_time[0]/1000, total_add_work_time[1]/1000,
-        total_add_work_time[2]/1000, total_add_work_time[3]/1000,
-        total_add_work_time[4]/1000 );
+        total_add_work_time[0] / 1000, total_add_work_time[1] / 1000, total_add_work_time[2] / 1000,
+        total_add_work_time[3] / 1000, total_add_work_time[4] / 1000 );
 #endif
 }

@@ -643,8 +676,8 @@ void ThreadPool::setNumThreads(
    int d_N_threads_diff = num_worker_threads - d_N_threads;
    if ( d_N_threads_diff > 0 ) {
        // Check that no threads are in the process of being deleted
-        for ( int i = 0; i < MAX_NUM_THREADS / 64; i++ ) {
-            if ( d_cancel[i] != 0 )
+        for ( AtomicOperations::int64_atomic i : d_cancel ) {
+            if ( i != 0 )
                throw std::logic_error(
                    "Threads are being created and destroyed at the same time" );
        }
@@ -670,11 +703,11 @@ void ThreadPool::setNumThreads(
            j++;
        }
        // Wait for all of the threads to finish initialization
-        while ( 1 ) {
+        while ( true ) {
            std::this_thread::sleep_for( std::chrono::milliseconds( 25 ) );
            bool wait = false;
-            for ( int i = 0; i < MAX_NUM_THREADS / 64; i++ ) {
-                if ( d_cancel[i] != 0 )
+            for ( AtomicOperations::int64_atomic i : d_cancel ) {
+                if ( i != 0 )
                    wait = true;
            }
            if ( !wait )
@@ -752,7 +785,7 @@ void ThreadPool::setNumThreads(
            }
        } else {
            // There are fewer cpus than threads, threads will share a processor
-            int N_threads_proc =
+            auto N_threads_proc =
                static_cast<int>( ( cpus.size() + d_N_threads - 1 ) / cpus.size() );
            for ( int i = 0; i < d_N_threads; i++ )
                t_procs[i].push_back( cpus[i / N_threads_proc] );
@@ -797,8 +830,8 @@ void ThreadPool::tpool_thread( int thread_id )
        try {
            std::vector<int> cpus = ThreadPool::getProcessAffinity();
            printp( "%i cpus for current thread: ", (int) cpus.size() );
-            for ( size_t i = 0; i < cpus.size(); i++ )
-                printp( "%i ", cpus[i] );
+            for ( int cpu : cpus )
+                printp( "%i ", cpu );
            printp( "\n" );
        } catch ( ... ) {
            printp( "Unable to get process affinity\n" );
@@ -811,7 +844,8 @@ void ThreadPool::tpool_thread( int thread_id )
        // Check if there is work to do
        if ( d_queue_list.size() > 0 ) {
            // Get next work item to process
-            auto work_id = d_queue_list.remove( []( const thread_id_t& id ) { return id.ready(); } );
+            auto work_id =
+                d_queue_list.remove( []( const thread_id_t &id ) { return id.ready(); } );
            if ( work_id.isNull() ) {
                std::this_thread::yield();
                continue;
@@ -821,14 +855,28 @@ void ThreadPool::tpool_thread( int thread_id )
            // Start work here
            PROFILE_THREADPOOL_START( "thread working" );
            work->d_state = 2;
+            if ( d_errorHandler ) {
+                try {
                    work->run();
+                } catch ( std::exception &e ) {
+                    auto msg = Utilities::stringf(
+                        "Error, caught exception in thread %i:\n  %s\n", thread_id, e.what() );
+                    d_errorHandler( msg );
+                } catch ( ... ) {
+                    auto msg = Utilities::stringf(
+                        "Error, caught unknown exception in thread %i\n", thread_id );
+                    d_errorHandler( msg );
+                }
+            } else {
+                work->run();
+            }
            work->d_state = 3;
            PROFILE_THREADPOOL_STOP( "thread working" );
            AtomicOperations::atomic_increment( &d_N_finished );
            // Check if any threads are waiting on the current work item
            // This can be done without blocking
-            for ( int i = 0; i < MAX_WAIT; i++ ) {
-                const wait_ids_struct *wait = const_cast<const wait_ids_struct *>(d_wait[i]);
+            for ( auto &i : d_wait ) {
+                auto wait = AtomicOperations::atomic_get( &i );
                if ( wait != nullptr )
                    wait->id_finished( work_id );
            }
@@ -878,7 +926,8 @@ inline void ThreadPool::add_work( const ThreadPool::thread_id_t& id )
        const auto &id1 = work->d_ids[i];
        if ( !id1.started() && id1 < id ) {
            // Remove and add the id back with a higher priority
-            auto id2 = d_queue_list.remove( []( const thread_id_t& a, const thread_id_t& b ) { return a==b; }, id1 );
+            auto id2 = d_queue_list.remove(
+                []( const thread_id_t &a, const thread_id_t &b ) { return a == b; }, id1 );
            id2.setPriority( std::max( priority, id2.getPriority() ) );
            d_queue_list.insert( id2 );
        }
@@ -926,7 +975,7 @@ void ThreadPool::add_work(
    }
    // Wait for enough room in the queue (doesn't need blocking since it isn't that precise)
    if ( N > static_cast<size_t>( MAX_QUEUED - d_queue_list.size() ) ) {
-        int N_wait = static_cast<int>( N - ( MAX_QUEUED - d_queue_list.size() ) );
+        auto N_wait = static_cast<int>( N - ( MAX_QUEUED - d_queue_list.size() ) );
        while ( N_wait > 0 ) {
            d_signal_count = static_cast<unsigned char>( std::min( N_wait, 255 ) );
            d_wait_finished.wait_for( 1e-4 );
@@ -964,17 +1013,6 @@ void ThreadPool::add_work(
 }


-/******************************************************************
-* This function removes a finished work item                      *
-******************************************************************/
-ThreadPool::WorkItem *ThreadPool::getFinishedWorkItem( ThreadPool::thread_id_t id ) const
-{
-    if ( id.finished() )
-        return id.work();
-    return nullptr;
-}
-
-
 /******************************************************************
 * This function waits for a some of the work items to finish      *
 ******************************************************************/
@@ -1027,7 +1065,7 @@ int ThreadPool::wait_some(
        throw std::logic_error( "Internal error: failed to wait" );
    // Delete the wait event struct
    // Note: we want to maintain the reference in case a thread is still using it
-    // Note: technically this should be atomic
+    // Note: technically this should be atomic, but it really isn't necessary here
    std::swap( d_wait_last, tmp );
    delete tmp;
    return N_finished;
@@ -1037,23 +1075,25 @@ int ThreadPool::wait_some(
 /******************************************************************
 * This function waits for all of the threads to finish their work *
 ******************************************************************/
-void ThreadPool::check_wait_time( std::chrono::time_point<std::chrono::high_resolution_clock>& t1 ) const
+void ThreadPool::check_wait_time(
+    std::chrono::time_point<std::chrono::high_resolution_clock> &t1 ) const
 {
    auto t2 = std::chrono::high_resolution_clock::now();
-    if ( std::chrono::duration_cast<std::chrono::seconds>(t2-t1).count() > MAX_WAIT_TIME_DEBUG ) {
-        std::cout << "Warning: Maximum wait time in ThreadPool exceeded, threads may be hung\n";
-        std::cout << "N_active: " << d_num_active << std::endl;
-        std::cout << "N_queued: " << d_queue_list.size() << std::endl;
-        std::cout << "N_added: " << d_N_added << std::endl;
-        std::cout << "N_started: " << d_N_started << std::endl;
-        std::cout << "N_finished: " << d_N_finished << std::endl;
-        std::cout << "queue.insert(): " << d_queue_list.N_insert() << std::endl;
-        std::cout << "queue.remove(): " << d_queue_list.N_remove() << std::endl;
-        std::cout << "Stack Trace:\n";
+    if ( std::chrono::duration_cast<std::chrono::seconds>( t2 - t1 ).count() > d_max_wait_time ) {
+        pout << "Warning: Maximum wait time in ThreadPool exceeded, threads may be hung\n";
+        pout << "N_active: " << d_num_active << std::endl;
+        pout << "N_queued: " << d_queue_list.size() << std::endl;
+        pout << "N_added: " << d_N_added << std::endl;
+        pout << "N_started: " << d_N_started << std::endl;
+        pout << "N_finished: " << d_N_finished << std::endl;
+        pout << "queue.insert(): " << d_queue_list.N_insert() << std::endl;
+        pout << "queue.remove(): " << d_queue_list.N_remove() << std::endl;
+        pout << "Stack Trace:\n";
        auto call_stack = StackTrace::getAllCallStacks();
+        StackTrace::cleanupStackTrace( call_stack );
        auto text = call_stack.print( "  " );
        for ( auto &line : text )
-            std::cout << line << std::endl;
+            pout << line << std::endl;
        t1 = std::chrono::high_resolution_clock::now();
    }
 }
@@ -1077,12 +1117,10 @@ void ThreadPool::wait_pool_finished() const
 /******************************************************************
 * Member functions of wait_ids_struct                             *
 ******************************************************************/
-ThreadPool::wait_ids_struct::wait_ids_struct( size_t N, const ThreadPool::thread_id_t *ids, size_t N_wait,
-    AtomicOperations::pool<condition_variable,128>& cv_pool, int N_wait_list, volatile wait_ids_struct **list ):
-    d_wait( N_wait ),
-    d_N(0),
-    d_cv_pool( cv_pool ),
-    d_wait_event( cv_pool.get() )
+ThreadPool::wait_ids_struct::wait_ids_struct( size_t N, const ThreadPool::thread_id_t *ids,
+    size_t N_wait, AtomicOperations::pool<condition_variable, 128> &cv_pool, int N_wait_list,
+    volatile wait_ids_struct **list )
+    : d_wait( N_wait ), d_N( 0 ), d_cv_pool( cv_pool ), d_wait_event( cv_pool.get() )
 {
    d_ids = new ThreadPool::thread_id_t[N];
    for ( size_t i = 0; i < N; i++ ) {
@@ -1095,9 +1133,18 @@ ThreadPool::wait_ids_struct::wait_ids_struct( size_t N, const ThreadPool::thread
    d_finished = new bool[d_N];
    memset( (void *) d_finished, 0, d_N );
    int i = 0;
-    while ( !AtomicOperations::atomic_compare_and_swap( (void *volatile *) &list[i], nullptr, this ) ) { i = (i+1)%N_wait_list; }
+    while (
+        !AtomicOperations::atomic_compare_and_swap( (void *volatile *) &list[i], nullptr, this ) ) {
+        i = ( i + 1 ) % N_wait_list;
+    }
    d_ptr = &list[i];
 }
+ThreadPool::wait_ids_struct::~wait_ids_struct()
+{
+    d_cv_pool.put( d_wait_event );
+    delete[] d_finished;
+    delete[] d_ids;
+}
 void ThreadPool::wait_ids_struct::id_finished( const ThreadPool::thread_id_t &id ) const
 {
    int index = find_id( d_N, d_ids, id );
@@ -1107,9 +1154,10 @@ void ThreadPool::wait_ids_struct::id_finished( const ThreadPool::thread_id_t& id
        for ( int i = 0; i < d_N; i++ )
            N_finished += d_finished[i] ? 1 : 0;
        if ( N_finished >= d_wait ) {
-            *d_ptr = nullptr;
-            d_wait = 0;
            d_N    = 0;
+            d_wait = 0;
+            AtomicOperations::atomic_compare_and_swap(
+                (void *volatile *) d_ptr, (void *) *d_ptr, nullptr );
            d_wait_event->notify_all();
        }
    }
@@ -1132,7 +1180,8 @@ bool ThreadPool::wait_ids_struct::wait_for( double seconds )
            break;
        }
        auto t2 = std::chrono::high_resolution_clock::now();
-        if ( 1e-6*std::chrono::duration_cast<std::chrono::microseconds>(t2-t1).count() > seconds )
+        if ( 1e-6 * std::chrono::duration_cast<std::chrono::microseconds>( t2 - t1 ).count() >
+             seconds )
            return false;
        d_wait_event->wait_for( 1e-5 );
    }
@@ -1154,7 +1203,7 @@ void quicksort( int n, T *arr )
    jstack = 0;
    l      = 0;
    ir     = n - 1;
-    while ( 1 ) {
+    while ( true ) {
        if ( ir - l < 7 ) { // Insertion sort when subarray small enough.
            for ( j = l + 1; j <= ir; j++ ) {
                a    = arr[j];
@@ -1292,7 +1341,7 @@ void ThreadPool::WorkItem::add_dependencies( size_t N, const ThreadPool::thread_
            const_cast<thread_id_t &>( ids[i] ).swap( tmp[i] );
        delete[] tmp;
        d_size     = N2;
-        int* lock = reinterpret_cast<int*>(&d_ids[d_size-1]);
+        auto *lock = reinterpret_cast<int *>( &d_ids[d_size - 1] );
        *lock      = 0;
    }
    const ThreadPool::thread_id_t id0;
--- a/threadpool/thread_pool.h
+++ b/threadpool/thread_pool.h
@@ -3,53 +3,25 @@
 // PARTICULAR PURPOSE.
 #ifndef included_AtomicModelThreadPool
 #define included_AtomicModelThreadPool
+
+#include <condition_variable>
 #include <iostream>
 #include <map>
+#include <mutex>
 #include <stdarg.h>
 #include <stdexcept>
 #include <stdio.h>
 #include <string.h>
+#include <thread>
 #include <typeinfo>
 #include <vector>
-#include <mutex>
-#include <thread>
-#include <condition_variable>


 #include "threadpool/atomic_helpers.h"
 #include "threadpool/atomic_list.h"


-// Choose the OS
-#if defined( WIN32 ) || defined( _WIN32 ) || defined( WIN64 ) || defined( _WIN64 )
-    // Using windows
-    #define USE_WINDOWS
-#elif defined( __APPLE__ )
-    // Using MAC
-    #define USE_MAC
-#elif defined( __linux ) || defined( __unix ) || defined( __posix )
-    // Using linux
-    #define USE_LINUX
-#else
-    #error Unknown OS
-#endif
-
-
-// Set some definitions
-#define MAX_NUM_THREADS 128     // The maximum number of threads (must be a multiple of 64)
-#define MAX_QUEUED 1024         // The maximum number of items in the work queue at any moment
-#define MAX_WAIT 16             // The maximum number of active waits at any given time
-#define MAX_WAIT_TIME_DEBUG 600 // The maximum time in a wait command before printing a warning message
-
-#define PROFILE_THREADPOOL_PERFORMANCE 0    // Add profile timers to the threadpool (default is 0)
-#define MONITOR_THREADPOOL_PERFORMANCE 0    // Add detailed performance counters (default is 0)
-
-
-// Check the c++ std
-#if CXX_STD==98
-#error Thread pool class requires c++11 or newer
-#endif
-
+// clang-format off


 /** \class ThreadPool
@@ -75,6 +47,13 @@
 */
 class ThreadPool
 {
+public:
+    ///// Set some global properties
+    constexpr static int MAX_NUM_THREADS = 128; // The maximum number of threads (must be a multiple of 64)
+    constexpr static int MAX_QUEUED = 1024;     // The maximum number of items in the work queue at any moment
+    constexpr static int MAX_WAIT = 16;         // The maximum number of active waits at any given time
+    constexpr static bool PROFILE_THREADPOOL_PERFORMANCE = false; // Add profile timers to the threadpool
+    constexpr static bool MONITOR_THREADPOOL_PERFORMANCE = false; // Add detailed performance counters

 public:
    ///// Member classes
@@ -102,7 +81,7 @@ public:
        inline thread_id_t( volatile thread_id_t &&rhs );
        inline thread_id_t &operator=( const thread_id_t &rhs ) volatile;
        inline thread_id_t &operator=( volatile thread_id_t &&rhs ) volatile;
-#ifndef USE_WINDOWS
+#if !defined( WIN32 ) && !defined( _WIN32 ) && !defined( WIN64 ) && !defined( _WIN64 )
        inline thread_id_t( const thread_id_t &rhs );
        inline thread_id_t &operator=( thread_id_t &&rhs );
        inline thread_id_t &operator=( const thread_id_t &rhs );
@@ -245,7 +224,7 @@ public:
        //! Run the work item
        virtual void run() override = 0;
        //! Will the routine return a result
-        virtual bool has_result() const override = 0;
+        virtual bool has_result() const override final { return !std::is_same<return_type,void>::value; }
        //! Return the results
        return_type get_results() const { return d_result; }
        //! Virtual destructor
@@ -353,10 +332,12 @@ public:
     *   in the ThreadPool without checking the existing work unless the desired number of
     *   threads is 0.  In this case, the function will wait for all work items to finish
     *   before deleting the existing work threads.
+
     *   Member threads may not call this function.
     * @param N                 The desired number of worker threads
     * @param affinity          The affinity scheduler to use:
     *                          none - Let the OS handle the affinities (default)
+
     *                          independent - Give each thread an independent set of processors
     * @param procs             The processors to use (defaults to the process affinitiy list)
     */
@@ -368,6 +349,16 @@ public:
    }


+    /*!
+     * \brief   Function to set the maximum wait time
+     * \details  This function sets the maximum time the thread pool will
+     *    wait before warning about a possible hung thread.
+     *    Default is to wait 10 minutes.
+     * @param time              The maximum time to wait before warning (seconds)
+     */
+    inline void setMaxWaitTimeDebug( const int time ) { d_max_wait_time = time; }
+
+
    /*!
     * \brief   Function to return the current thread number
     * \details  This function will return the thread number of current active thread.
@@ -400,16 +391,14 @@ public:
     * @param id                The id of the work item
     */
    template <class return_type>
-    inline return_type getFunctionRet( const thread_id_t &id ) const;
+    static inline return_type getFunctionRet( const thread_id_t &id );


    /*!
     * \brief   Function to create a work item
     * \details This function creates a work item that can be added to the queue
-     * @param work              Pointer to the work item to add
-     *                          Note that the threadpool will automatically destroy the item when
-     * finished
-     * @param priority          A value indicating the priority of the work item (0-default)
+     * @param routine           Function to call from the thread pool
+     * @param args              Function arguments to pass
     */
    template <class Ret, class... Args>
    static inline WorkItem* createWork( Ret( *routine )( Args... ), Args... args );
@@ -505,6 +494,7 @@ public:
     *   If successful it returns the indicies of the finished work items (the index in the array ids).
     *   Note: any thread may call this routine, but they will block until finished.
     *   For worker threads this may eventually lead to a deadlock.
+     * @param N_wait            Number of work items to wait for
     * @param ids               Vector of work items to wait for
     */
    inline std::vector<int> wait_some( int N_wait, const std::vector<thread_id_t> &ids ) const;
@@ -552,6 +542,69 @@ public:
    //! Return the number of items queued
    int N_queued( ) const { return d_queue_list.size(); }

+
+    //! Set the error handler for threads
+    void setErrorHandler( std::function<void(const std::string&)> fun );
+
+
+public: // Static interface
+
+    /*!
+     * \brief   Function to return the number of work threads
+     * \details This function returns the number of threads in the thread pool,
+     *    or 0 if the thread pool is empty or does not exist
+     * @param tpool         Threadpool to query (may be null)
+     */
+    static inline int numThreads( const ThreadPool* tpool ) { return tpool ? tpool->getNumThreads() : 0; }
+
+    /*!
+     * \brief   Function to add a work item
+     * \details This function adds a work item to the queue
+     *   Note: any thread may call this routine.
+     * @param tpool         Threadpool to add work to (may be null)
+     * @param work          Pointer to the work item to add
+     *                      Note that the threadpool will automatically destroy the item when finished
+     * @param priority      A value indicating the priority of the work item (0-default)
+     */
+    static inline thread_id_t add_work( ThreadPool* tpool, ThreadPool::WorkItem *work, int priority = 0 );
+
+
+    /*!
+     * \brief   Function to add multiple work items
+     * \details This function adds multiple work items to the queue
+     *   Note: any thread may call this routine.
+     * @param tpool         Threadpool to add work to (may be null)
+     * @param work          Vector of pointers to the work items to add
+     *                      Note that the threadpool will automatically destroy the item when finished
+     * @param priority      Vector of values indicating the priority of the work items
+     */
+    static inline std::vector<thread_id_t> add_work( ThreadPool* tpool, const std::vector<ThreadPool::WorkItem *> &work,
+        const std::vector<int> &priority = std::vector<int>() );
+
+
+    /*!
+     * \brief   Function to wait until all of the given work items have finished their work
+     * \details This function waits for all of the given work items to finish.  It returns 0
+     * if successful.
+     *   Note: any thread may call this routine, but they will block until finished.
+     *   For worker threads this may eventually lead to a deadlock.
+     * @param tpool         Threadpool containing work (must match call to add_work)
+     * @param ids           Vector of work items to wait for
+     */
+    static inline int wait_all( const ThreadPool* tpool, const std::vector<thread_id_t> &ids );
+
+
+    /*!
+     * \brief   Function to wait until all work items in the thread pool have finished their work
+     * \details This function will wait until all work has finished.
+     *   Note: member threads may not call this function.
+     *   Only one non-member thread should call this routine at a time.
+     * @param tpool         Threadpool containing work (must match call to add_work)
+     */
+    static inline void wait_pool_finished( const ThreadPool* tpool ) { if ( tpool ) { tpool->wait_pool_finished(); } }
+
+
+
 private:
    typedef AtomicOperations::int32_atomic int32_atomic;

@@ -593,7 +646,7 @@ private:
      public:
        wait_ids_struct( size_t N, const ThreadPool::thread_id_t *ids, size_t N_wait,
            AtomicOperations::pool<condition_variable,128>& cv_pool, int N_wait_list, volatile wait_ids_struct **list );
-        ~wait_ids_struct( ) { d_cv_pool.put( d_wait_event ); delete [] d_finished; delete [] d_ids; }
+        ~wait_ids_struct( );
        void id_finished( const ThreadPool::thread_id_t& id ) const;
        bool wait_for( double seconds );
      private:
@@ -628,7 +681,10 @@ private:
    inline void add_work( const ThreadPool::thread_id_t& id );

    // Function to get a work item that has finished
-    WorkItem *getFinishedWorkItem( ThreadPool::thread_id_t id ) const;
+    static inline WorkItem *getFinishedWorkItem( const ThreadPool::thread_id_t& id )
+    {
+        return id.finished() ? id.work():nullptr; // nullptr unless id.finished() is true
+    }

    // This function provides a wrapper (needed for the threads)
    static inline void create_new_thread( ThreadPool *tpool, int id )
@@ -676,10 +732,13 @@ private:
    std::thread::id d_threadId[MAX_NUM_THREADS]; // Unique id for each thread
    queue_type d_queue_list;                // The work queue
    size_t d_NULL_TAIL;                     // Null data buffer to check memory bounds
+    int d_max_wait_time;                    // The maximum time in a wait command before printing a warning message
+    std::function<void(const std::string&)> d_errorHandler;
 };


 #include "threadpool/thread_pool.hpp"


+// clang-format on
 #endif
--- a/threadpool/thread_pool.hpp
+++ b/threadpool/thread_pool.hpp
@@ -23,7 +23,7 @@
 */
 #define TPOOL_TUPLE_TO_SEQ( t ) TPOOL_TUPLE_TO_SEQ_##II t
 #define TPOOL_TUPLE_TO_SEQ_II( a, ... ) a, ##__VA_ARGS__
-#ifdef USE_WINDOWS
+#if defined( WIN32 ) || defined( _WIN32 ) || defined( WIN64 ) || defined( _WIN64 )
 #define TPOOL_GET_PRIORITY( a, N, c, ... ) N
 #define TPOOL_ADD_WORK( TPOOL, FUNCTION, ARGS, ... )                                      \
    ThreadPool_add_work( TPOOL, TPOOL_GET_PRIORITY( 0, __VA_ARGS__, 0, 0 ) + 0, FUNCTION, \
@@ -40,7 +40,6 @@
 // \cond HIDDEN_SYMBOLS


-
 // Unpack a tuple and call a function
 template<int...>
 struct index_tuple {
@@ -59,7 +58,8 @@ template <typename... Types>
 struct make_indexes : make_indexes_impl<0, index_tuple<>, Types...> {
 };
 template<class Ret, class... Args, int... Indexes>
-inline Ret apply_helper( Ret ( *pf )( Args... ), index_tuple<Indexes...>, std::tuple<Args...> &&tup )
+inline Ret apply_helper(
+    Ret ( *pf )( Args... ), index_tuple<Indexes...>, std::tuple<Args...> &&tup )
 {
    return pf( std::forward<Args>( std::get<Indexes>( tup ) )... );
 }
@@ -82,9 +82,9 @@ class ThreadPool::WorkItemRet<void> : public ThreadPool::WorkItem
 {
 public:
    virtual void run() override = 0;
-    virtual bool has_result() const override { return false; }
    void get_results() {}
    virtual ~WorkItemRet() {}
+    virtual bool has_result() const override final { return false; }
 };


@@ -104,11 +104,7 @@ public:
        : ThreadPool::WorkItemRet<void>(), routine( routine2 ), args( ts... )
    {
    }
-    virtual void run() override
-    {
-        apply( routine, args );
-    }
-    virtual bool has_result() const override { return false; }
+    virtual void run() override { apply( routine, args ); }
    virtual ~WorkItemFull() {}
 };
 template<class Ret, class... Args>
@@ -124,11 +120,7 @@ public:
        : ThreadPool::WorkItemRet<Ret>(), routine( routine2 ), args( ts... )
    {
    }
-    virtual void run() override
-    {
-        this->d_result = apply( routine, args );
-    }
-    virtual bool has_result() const override { return true; }
+    virtual void run() override { this->d_result = apply( routine, args ); }
    virtual ~WorkItemFull() {}
 };

@@ -138,15 +130,15 @@ template <class Ret, class... Ts>
 inline ThreadPool::thread_id_t ThreadPool_add_work(
    ThreadPool *tpool, int priority, Ret ( *routine )( Ts... ), Ts... ts )
 {
-    ThreadPool::WorkItem *work = new WorkItemFull<Ret, Ts...>( routine, ts... );
-    return tpool->add_work( work, priority );
+    auto work = new WorkItemFull<Ret, Ts...>( routine, ts... );
+    return ThreadPool::add_work( tpool, work, priority );
 }
 template<class Ret>
 inline ThreadPool::thread_id_t ThreadPool_add_work(
    ThreadPool *tpool, int priority, Ret ( *routine )(), void * )
 {
-    ThreadPool::WorkItem *work = new WorkItemFull<Ret>( routine );
-    return tpool->add_work( work, priority );
+    auto work = new WorkItemFull<Ret>( routine );
+    return ThreadPool::add_work( tpool, work, priority );
 }
 template<class Ret, class... Args>
 inline ThreadPool::WorkItem *ThreadPool::createWork( Ret ( *routine )( Args... ), Args... args )
@@ -158,6 +150,7 @@ inline ThreadPool::WorkItem* ThreadPool::createWork( Ret( *routine )( Args... ),
 /******************************************************************
 * Function to get the returned function value                     *
 ******************************************************************/
+// clang-format off
 template<class T> inline constexpr T zeroConstructor();
 template<> inline constexpr bool zeroConstructor<bool>() { return false; }
 template<> inline constexpr char zeroConstructor<char>() { return 0; }
@@ -170,11 +163,12 @@ template<> inline constexpr float zeroConstructor<float>( ) { return 0; }
 template<> inline constexpr double zeroConstructor<double>() { return 0; }
 template<class T> inline constexpr T zeroConstructor() { return T(); }
 template<class Ret>
-inline Ret ThreadPool::getFunctionRet( const ThreadPool::thread_id_t &id ) const
+inline Ret ThreadPool::getFunctionRet( const ThreadPool::thread_id_t &id )
 {
-    WorkItemRet<Ret> *work = dynamic_cast<WorkItemRet<Ret>*>( getFinishedWorkItem( id ) );
+    auto work = dynamic_cast<WorkItemRet<Ret> *>( getFinishedWorkItem( id ) );
    return work == nullptr ? zeroConstructor<Ret>() : work->get_results();
 }
+// clang-format on


 /******************************************************************
@@ -234,7 +228,14 @@ inline int ThreadPool::wait_all( const std::vector<thread_id_t> &ids ) const
    delete[] finished;
    return 0;
 }
-inline std::vector<int> ThreadPool::wait_some( int N_wait, const std::vector<thread_id_t> &ids ) const
+inline int ThreadPool::wait_all( const ThreadPool *tpool, const std::vector<thread_id_t> &ids )
+{
+    if ( tpool ) // A pool was supplied: forward to its member wait_all
+        return tpool->wait_all( ids );
+    return ids.size(); // NOTE(review): null pool returns ids.size(), not 0 - confirm intended
+}
+inline std::vector<int> ThreadPool::wait_some(
+    int N_wait, const std::vector<thread_id_t> &ids ) const
 {
    auto finished  = new bool[ids.size()];
    int N_finished = wait_some( ids.size(), ids.data(), N_wait, finished );
@@ -280,6 +281,32 @@ inline std::vector<ThreadPool::thread_id_t> ThreadPool::add_work(
        delete[] priority2;
    return ids;
 }
+inline ThreadPool::thread_id_t ThreadPool::add_work(
+    ThreadPool *tpool, ThreadPool::WorkItem *work, int priority )
+{
+    ThreadPool::thread_id_t id;
+    if ( tpool ) {
+        id = tpool->add_work( work, priority ); // Queue the item on the supplied pool
+    } else {
+        id.reset( priority, std::rand(), work ); // No pool: build an id directly around work
+        work->d_state = 2; // NOTE(review): presumably "running" - confirm against WorkItem
+        work->run(); // Execute the item synchronously on the calling thread
+        work->d_state = 3; // NOTE(review): presumably "finished" - confirm against WorkItem
+    }
+    return id;
+}
+inline std::vector<ThreadPool::thread_id_t> ThreadPool::add_work( ThreadPool *tpool,
+    const std::vector<ThreadPool::WorkItem *> &work, const std::vector<int> &priority )
+{
+    if ( tpool )
+        return tpool->add_work( work, priority );
+    // No pool: pass each item to the single-item overload.  priority defaults to an
+    // empty vector, so only index it where an entry exists and use 0 otherwise.
+    std::vector<ThreadPool::thread_id_t> ids( work.size() );
+    for ( size_t i = 0; i < work.size(); i++ )
+        ids[i] = add_work( tpool, work[i], i < priority.size() ? priority[i] : 0 );
+    return ids;
+}


 /******************************************************************
@@ -326,7 +353,7 @@ inline ThreadPool::thread_id_t::thread_id_t( const volatile ThreadPool::thread_i
    if ( d_count != NULL )
        AtomicOperations::atomic_increment( d_count );
 }
-#ifndef USE_WINDOWS
+#if !defined( WIN32 ) && !defined( _WIN32 ) && !defined( WIN64 ) && !defined( _WIN64 )
 inline ThreadPool::thread_id_t::thread_id_t( const thread_id_t &rhs )
    : d_id( rhs.d_id ), d_count( rhs.d_count ), d_work( rhs.d_work )
 {
@@ -488,7 +515,8 @@ inline bool ThreadPool::isValid( const ThreadPool::thread_id_t &id ) const
    static_assert( sizeof( atomic_64 ) == 8, "atomic_64 must be a 64-bit integer" );
    uint64_t local_id = id.getLocalID();
    uint64_t next_id  = d_id_assign - 1;
-    return local_id!=0 && id.initialized() && local_id<=thread_id_t::maxThreadID && local_id>next_id;
+    return local_id != 0 && id.initialized() && local_id <= thread_id_t::maxThreadID &&
+           local_id > next_id;
 }
				`@@ -27,4 +27,3 @@ int atomic_pthread_lock_initialized = create_atomic_pthread_lock();`

				`} // AtomicOperations namespace`