diff --git a/common/Array.h b/common/Array.h
index f926b101..edfa687a 100644
--- a/common/Array.h
+++ b/common/Array.h
@@ -1,34 +1,15 @@
 #ifndef included_ArrayClass
 #define included_ArrayClass
 
-#include <vector>
 #include <array>
+#include <cstring>
 #include <functional>
+#include <initializer_list>
 #include <iostream>
-#include <stdexcept>
 #include <memory>
-#include <iostream>
+#include <vector>
 
-
-#define ARRAY_NDIM_MAX 5 // Maximum number of dimensions supported
-
-
-#define GET_ARRAY_INDEX3D( N, i1, i2, i3 ) i1 + N[0] * ( i2 + N[1] * i3 )
-#define GET_ARRAY_INDEX4D( N, i1, i2, i3, i4 ) i1 + N[0] * ( i2 + N[1] * ( i3 + N[2] * i4 ) )
-#define GET_ARRAY_INDEX5D( N, i1, i2, i3, i4, i5 ) i1 + N[0] * ( i2 + N[1] * ( i3 + N[2] * ( i4 + N[3] * i5 ) ) )
-
-#if defined( DEBUG ) || defined( _DEBUG )
-    #define CHECK_ARRAY_INDEX3D( N, i1, i2, i3 )                  \
-        if ( GET_ARRAY_INDEX3D( N, i1, i2, i3 ) < 0 || GET_ARRAY_INDEX3D( N, i1, i2, i3 ) >= d_length ) \
-            throw std::logic_error( "Index exceeds array bounds" );
-    #define CHECK_ARRAY_INDEX4D( N, i1, i2, i3, i4 )              \
-        if ( GET_ARRAY_INDEX4D( N, i1, i2, i3, i4 ) < 0 ||        \
-             GET_ARRAY_INDEX4D( N, i1, i2, i3, i4 ) >= d_length ) \
-            throw std::logic_error( "Index exceeds array bounds" );
-#else
-    #define CHECK_ARRAY_INDEX3D( N, i1, i2, i3 )
-    #define CHECK_ARRAY_INDEX4D( N, i1, i2, i3, i4 )
-#endif
+#include "Utilities.h"
 
 
 #if defined( __CUDA_ARCH__ )
@@ -37,20 +18,244 @@
 #else
 #define HOST_DEVICE
 #endif
+#if defined( USING_GCC ) || defined( USING_CLANG )
+#define ATTRIBUTE_INLINE __attribute__( ( always_inline ) ) // Request forced inlining
+#else
+#define ATTRIBUTE_INLINE // Expands to nothing when neither USING_GCC nor USING_CLANG is defined
+#endif
+
+// Debug-only check that i < d_length (no-op otherwise). NOTE(review): std::length_error needs <stdexcept> - confirm it is still included
+#if ( defined( DEBUG ) || defined( _DEBUG ) ) && !defined( NDEBUG )
+#define CHECK_ARRAY_LENGTH( i )                                      \
+    do {                                                             \
+        if ( i >= d_length )                                         \
+            throw std::length_error( "Index exceeds array bounds" ); \
+    } while ( 0 )
+#else
+#define CHECK_ARRAY_LENGTH( i ) \
+    do {                        \
+    } while ( 0 )
+#endif
+
+
+// Forward declaration
+class FunctionTable;
+
+
+//! Simple range class holding the three values of a range i:k:j (start:increment:end)
+template<class TYPE = size_t>
+class Range final
+{
+public:
+    //! Empty constructor: i = 0, j = TYPE( -1 ), k = 1 (for unsigned TYPE, j wraps to the maximum value)
+    Range() : i( 0 ), j( -1 ), k( 1 ) {}
+
+    /*!
+     * Create a range i:k:j (or i:j)
+     * @param i_            Starting value
+     * @param j_            Ending value
+     * @param k_            Increment value (defaults to 1)
+     */
+    Range( TYPE i_, TYPE j_, TYPE k_ = 1 ) : i( i_ ), j( j_ ), k( k_ ) {}
+
+    TYPE i, j, k; //!< Starting value, ending value, and increment
+};
+
+
+//! Simple class to store the array dimensions (at most maxDim() dimensions)
+class ArraySize final
+{
+public:
+    //! Empty constructor
+    inline ArraySize();
+
+    /*!
+     * Create a 1D array size
+     * @param N1            Number of elements in the first dimension
+     */
+    inline ArraySize( size_t N1 );
+
+    /*!
+     * Create a 2D array size
+     * @param N1            Number of elements in the first dimension
+     * @param N2            Number of elements in the second dimension
+     */
+    inline ArraySize( size_t N1, size_t N2 );
+
+    /*!
+     * Create a 3D array size
+     * @param N1            Number of elements in the first dimension
+     * @param N2            Number of elements in the second dimension
+     * @param N3            Number of elements in the third dimension
+     */
+    inline ArraySize( size_t N1, size_t N2, size_t N3 );
+
+    /*!
+     * Create a 4D array size
+     * @param N1            Number of elements in the first dimension
+     * @param N2            Number of elements in the second dimension
+     * @param N3            Number of elements in the third dimension
+     * @param N4            Number of elements in the fourth dimension
+     */
+    inline ArraySize( size_t N1, size_t N2, size_t N3, size_t N4 );
+
+    /*!
+     * Create a 5D array size
+     * @param N1            Number of elements in the first dimension
+     * @param N2            Number of elements in the second dimension
+     * @param N3            Number of elements in the third dimension
+     * @param N4            Number of elements in the fourth dimension
+     * @param N5            Number of elements in the fifth dimension
+     */
+    inline ArraySize( size_t N1, size_t N2, size_t N3, size_t N4, size_t N5 );
+
+    /*!
+     * Create from initializer list
+     * @param N             Size of the array
+     */
+    inline ArraySize( std::initializer_list<size_t> N );
+
+    /*!
+     * Create from raw pointer
+     * @param ndim          Number of dimensions
+     * @param dims          Number of elements in each dimension
+     */
+    inline ArraySize( size_t ndim, const size_t *dims );
+
+    /*!
+     * Create from std::vector
+     * @param N             Size of the array
+     */
+    inline ArraySize( const std::vector<size_t> &N );
+
+    /*!
+     * Copy constructor
+     * @param rhs           Array size to copy
+     */
+    inline ArraySize( const ArraySize &rhs );
+
+    /*!
+     * Move constructor
+     * @param rhs           Array size to move from
+     */
+    inline ArraySize( ArraySize &&rhs );
+
+    /*!
+     * Assignment operator
+     * @param rhs           Array size to copy
+     */
+    inline ArraySize &operator=( const ArraySize &rhs );
+
+    /*!
+     * Move assignment operator
+     * @param rhs           Array size to move from
+     */
+    inline ArraySize &operator=( ArraySize &&rhs );
+
+    /*!
+     * Access the size of the ith dimension (no bounds check on i)
+     * @param i             Index to access
+     */
+    inline size_t operator[]( size_t i ) const { return d_N[i]; }
+
+    //! Return the number of dimensions
+    inline uint8_t ndim() const ATTRIBUTE_INLINE { return d_ndim; }
+
+    //! Return the number of dimensions as a size_t (same value as ndim())
+    inline size_t size() const ATTRIBUTE_INLINE { return d_ndim; }
+
+    //! Return the total number of elements (the stored d_length)
+    inline size_t length() const ATTRIBUTE_INLINE { return d_length; }
+
+    //! Presumably sets the size of dimension dim to N (definition not shown here - TODO confirm)
+    inline void resize( uint8_t dim, size_t N );
+
+    //! Returns an iterator to the beginning
+    inline const size_t *begin() const ATTRIBUTE_INLINE { return d_N; }
+
+    //! Returns an iterator to the end
+    inline const size_t *end() const ATTRIBUTE_INLINE { return d_N + d_ndim; }
+
+    //! Check if two array sizes are equal (same ndim and all five stored dimensions match)
+    inline bool operator==( const ArraySize &rhs ) const ATTRIBUTE_INLINE
+    {
+        return d_ndim == rhs.d_ndim && memcmp( d_N, rhs.d_N, sizeof( d_N ) ) == 0;
+    }
+
+    //! Check if two array sizes are not equal
+    inline bool operator!=( const ArraySize &rhs ) const ATTRIBUTE_INLINE
+    {
+        return d_ndim != rhs.d_ndim || memcmp( d_N, rhs.d_N, sizeof( d_N ) ) != 0;
+    }
+
+    //! Maximum supported number of dimensions
+    constexpr static uint8_t maxDim() ATTRIBUTE_INLINE { return 5u; }
+
+    //! Get the linear index for a 1D index (range-checked against d_length in debug builds only)
+    inline size_t index( size_t i ) const ATTRIBUTE_INLINE
+    {
+        CHECK_ARRAY_LENGTH( i );
+        return i;
+    }
+
+    //! Get the linear index for 2D indices (first index varies fastest)
+    inline size_t index( size_t i1, size_t i2 ) const ATTRIBUTE_INLINE
+    {
+        size_t index = i1 + i2 * d_N[0];
+        CHECK_ARRAY_LENGTH( index );
+        return index;
+    }
+
+    //! Get the linear index for 3D indices (first index varies fastest)
+    inline size_t index( size_t i1, size_t i2, size_t i3 ) const ATTRIBUTE_INLINE
+    {
+        size_t index = i1 + d_N[0] * ( i2 + d_N[1] * i3 );
+        CHECK_ARRAY_LENGTH( index );
+        return index;
+    }
+
+    //! Get the linear index for 4D indices (first index varies fastest)
+    inline size_t index( size_t i1, size_t i2, size_t i3, size_t i4 ) const ATTRIBUTE_INLINE
+    {
+        size_t index = i1 + d_N[0] * ( i2 + d_N[1] * ( i3 + d_N[2] * i4 ) );
+        CHECK_ARRAY_LENGTH( index );
+        return index;
+    }
+
+    //! Get the linear index for 5D indices (first index varies fastest)
+    inline size_t index(
+        size_t i1, size_t i2, size_t i3, size_t i4, size_t i5 ) const ATTRIBUTE_INLINE
+    {
+        size_t index = i1 + d_N[0] * ( i2 + d_N[1] * ( i3 + d_N[2] * ( i4 + d_N[3] * i5 ) ) );
+        CHECK_ARRAY_LENGTH( index );
+        return index;
+    }
+
+private:
+    uint8_t d_ndim;  // Number of dimensions
+    size_t d_length; // Total number of elements
+    size_t d_N[5];   // Size of each dimension (operator== compares all five entries)
+};
 
 
 /*!
  * Class Array is a multi-dimensional array class written by Mark Berrill
  */
-template <class TYPE>
-class Array
+template<class TYPE, class FUN = FunctionTable>
+class Array final
 {
-public:
+public: // Constructors / assignment operators
     /*!
      * Create a new empty Array
      */
     Array();
 
+    /*!
+     * Create an Array with the given size
+     * @param N             Size of the array
+     */
+    explicit Array( const ArraySize &N );
+
     /*!
      * Create a new 1D Array with the given number of elements
      * @param N             Number of elements in the array
@@ -72,6 +277,25 @@ public:
      */
     explicit Array( size_t N1, size_t N2, size_t N3 );
 
+    /*!
+     * Create a new 4D Array with the given number of elements in each dimension
+     * @param N1            Number of elements in the first dimension
+     * @param N2            Number of elements in the second dimension
+     * @param N3            Number of elements in the third dimension
+     * @param N4            Number of elements in the fourth dimension
+     */
+    explicit Array( size_t N1, size_t N2, size_t N3, size_t N4 );
+
+    /*!
+     * Create a new 5D Array with the given number of elements in each dimension
+     * @param N1            Number of elements in the first dimension
+     * @param N2            Number of elements in the second dimension
+     * @param N3            Number of elements in the third dimension
+     * @param N4            Number of elements in the fourth dimension
+     * @param N5            Number of elements in the fifth dimension
+     */
+    explicit Array( size_t N1, size_t N2, size_t N3, size_t N4, size_t N5 );
+
     /*!
      * Create a multi-dimensional Array with the given number of elements
      * @param N             Number of elements in each dimension
@@ -79,6 +303,19 @@ public:
      */
     explicit Array( const std::vector<size_t> &N, const TYPE *data = NULL );
 
+    /*!
+     * Create a 1D Array with the range
+     * @param range         Range of the data
+     */
+    explicit Array( const Range<TYPE> &range );
+
+    /*!
+     * Create a 1D Array with the given initializer list
+     * @param data          Input data
+     */
+    Array( std::initializer_list<TYPE> data );
+
+
     /*!
      * Copy constructor
      * @param rhs           Array to copy
@@ -109,7 +346,7 @@ public:
      */
     Array &operator=( const std::vector<TYPE> &rhs );
 
-
+public: // Views/copies/subset
     /*!
      * Create a 1D Array view to a raw block of data
      * @param N             Number of elements in the array
@@ -141,8 +378,7 @@ public:
      * @param N             Number of elements in each dimension
      * @param data          Pointer to the data
      */
-    static std::shared_ptr<Array> view(
-        const std::vector<size_t> &N, std::shared_ptr<TYPE> const &data );
+    static std::shared_ptr<Array> view( const ArraySize &N, std::shared_ptr<TYPE> const &data );
 
 
     /*!
@@ -178,7 +414,7 @@ public:
      * @param data          Pointer to the data
      */
     static std::shared_ptr<const Array> constView(
-        const std::vector<size_t> &N, std::shared_ptr<const TYPE> const &data );
+        const ArraySize &N, std::shared_ptr<const TYPE> const &data );
 
 
     /*!
@@ -192,7 +428,20 @@ public:
      * @param N             Number of elements in each dimension
      * @param data          Pointer to the data
      */
-    void view2( const std::vector<size_t> &N, std::shared_ptr<TYPE> const &data );
+    void view2( const ArraySize &N, std::shared_ptr<TYPE> const &data );
+
+    /*!
+     * Make this object a view of the raw data (expert use only).
+     * Use view2( N, std::shared_ptr(data,[](TYPE*){}) ) instead.
+     *   Note: this interface is not recommended as it does not protect from
+     *   the src data being deleted while still being used by the Array.
+     *   Additionally for maximum performance it does not set the internal shared_ptr
+     *   so functions like getPtr and resize will not work correctly.
+     * @param ndim          Number of dimensions
+     * @param dims          Number of elements in each dimension
+     * @param data          Pointer to the data
+     */
+    void viewRaw( int ndim, const size_t *dims, TYPE *data );
 
     /*!
      * Make this object a view of the raw data (expert use only).
@@ -204,41 +453,30 @@ public:
      * @param N             Number of elements in each dimension
      * @param data          Pointer to the data
      */
-    void viewRaw( const std::initializer_list<size_t> &N, TYPE *data );
-
-    /*!
-     * Make this object a view of the raw data (expert use only).
-     * Use view2( N, std::shared_ptr(data,[](TYPE*){}) ) instead.
-     *   Note: this interface is not recommended as it does not protect from
-     *   the src data being deleted while still being used by the Array.
-     *   Additionally for maximum performance it does not set the internal shared_ptr
-     *   so functions like getPtr and resize will not work correctly.
-     * @param N             Number of elements in each dimension
-     * @param data          Pointer to the data
-     */
-    void viewRaw( const std::vector<size_t> &N, TYPE *data );
+    void viewRaw( const ArraySize &N, TYPE *data );
 
     /*!
      * Convert an array of one type to another.  This may or may not allocate new memory.
      * @param array         Input array
      */
-    template <class TYPE2>
-    static std::shared_ptr<Array<TYPE2>> convert( std::shared_ptr<Array<TYPE>> array );
+    template<class TYPE2>
+    static std::shared_ptr<Array<TYPE2>> convert( std::shared_ptr<Array<TYPE, FUN>> array );
 
 
     /*!
      * Convert an array of one type to another.  This may or may not allocate new memory.
      * @param array         Input array
      */
-    template <class TYPE2>
-    static std::shared_ptr<const Array<TYPE2>> convert( std::shared_ptr<const Array<TYPE>> array );
+    template<class TYPE2>
+    static std::shared_ptr<const Array<TYPE2>> convert(
+        std::shared_ptr<const Array<TYPE, FUN>> array );
 
 
     /*!
      * Copy and convert data from another array to this array
      * @param array         Source array
      */
-    template <class TYPE2>
+    template<class TYPE2>
     void copy( const Array<TYPE2> &array );
 
     /*!
@@ -246,16 +484,23 @@ public:
      *    Note: The current array must be allocated to the proper size first.
      * @param array         Source array
      */
-    template <class TYPE2>
+    template<class TYPE2>
     void copy( const TYPE2 *array );
 
     /*!
      * Copy and convert data from this array to a raw vector.
      * @param array         Source array
      */
-    template <class TYPE2>
+    template<class TYPE2>
     void copyTo( TYPE2 *array ) const;
 
+    /*!
+     * Copy and convert data from this array, returning the result as an Array of TYPE2.
+     * @return              Array of TYPE2 (presumably a converted copy of this array - TODO confirm)
+     */
+    template<class TYPE2>
+    Array<TYPE2, FUN> cloneTo() const;
+
 
     /*!
      * Fill the array with the given value
@@ -274,7 +519,7 @@ public:
      * @param base        Base array
      * @param exp         Exponent value
      */
-    void pow( const Array<TYPE> &baseArray, const TYPE &exp );
+    void pow( const Array<TYPE, FUN> &base, const TYPE &exp );
 
     //! Destructor
     ~Array();
@@ -285,23 +530,27 @@ public:
 
 
     //! Return the size of the Array
-    inline int ndim() const { return d_ndim; }
+    inline int ndim() const { return d_size.ndim(); }
 
 
     //! Return the size of the Array
-    inline std::vector<size_t> size() const { return std::vector<size_t>( d_N, d_N + d_ndim ); }
+    inline ArraySize &size() { return d_size; }
 
 
     //! Return the size of the Array
-    inline size_t size( int d ) const { return d_N[d]; }
+    inline ArraySize size() const { return d_size; }
 
 
     //! Return the size of the Array
-    inline size_t length() const { return d_length; }
+    inline size_t size( int d ) const { return d_size[d]; }
+
+
+    //! Return the total number of elements in the Array
+    inline size_t length() const { return d_size.length(); }
 
 
     //! Return true if the Array is empty
-    inline bool empty() const { return d_length == 0; }
+    inline bool empty() const { return d_size.length() == 0; }
 
 
     /*!
@@ -329,7 +578,8 @@ public:
      * Resize the Array
      * @param N             Number of elements in each dimension
      */
-    void resize( const std::vector<size_t> &N );
+    void resize( const ArraySize &N );
+
 
     /*!
      * Resize the given dimension of the array
@@ -344,48 +594,73 @@ public:
      * Reshape the Array (total size of array will not change)
      * @param N             Number of elements in each dimension
      */
-    void reshape( const std::vector<size_t> &N );
+    void reshape( const ArraySize &N );
 
 
     /*!
      * Subset the Array (total size of array will not change)
      * @param index         Index to subset (imin,imax,jmin,jmax,kmin,kmax,...)
      */
-    template<class TYPE2=TYPE>
-    Array<TYPE2> subset( const std::vector<size_t> &index ) const;
+    template<class TYPE2 = TYPE>
+    Array<TYPE2, FUN> subset( const std::vector<size_t> &index ) const;
+
+
+    /*!
+     * Subset the Array (total size of array will not change)
+     * @param index         Index to subset (ix:kx:jx,iy:ky:jy,...)
+     */
+    template<class TYPE2 = TYPE>
+    Array<TYPE2, FUN> subset( const std::vector<Range<size_t>> &index ) const;
+
 
     /*!
      * Copy data from an array into a subset of this array
      * @param index         Index of the subset (imin,imax,jmin,jmax,kmin,kmax,...)
      * @param subset        The subset array to copy from
      */
-    template <class TYPE2>
-    void copySubset( const std::vector<size_t> &index, const Array<TYPE2> &subset );
+    template<class TYPE2>
+    void copySubset( const std::vector<size_t> &index, const Array<TYPE2, FUN> &subset );
+
+    /*!
+     * Copy data from an array into a subset of this array
+     * @param index         Index of the subset
+     * @param subset        The subset array to copy from
+     */
+    template<class TYPE2>
+    void copySubset( const std::vector<Range<size_t>> &index, const Array<TYPE2, FUN> &subset );
 
     /*!
      * Add data from an array into a subset of this array
      * @param index         Index of the subset (imin,imax,jmin,jmax,kmin,kmax,...)
      * @param subset        The subset array to add from
      */
-    void addSubset( const std::vector<size_t> &index, const Array<TYPE> &subset );
+    void addSubset( const std::vector<size_t> &index, const Array<TYPE, FUN> &subset );
+
+    /*!
+     * Add data from an array into a subset of this array
+     * @param index         Index of the subset
+     * @param subset        The subset array to add from
+     */
+    void addSubset( const std::vector<Range<size_t>> &index, const Array<TYPE, FUN> &subset );
 
 
+public: // Accessors
     /*!
      * Access the desired element
      * @param i             The row index
      */
-    HOST_DEVICE inline TYPE &operator()( size_t i )
+    HOST_DEVICE inline TYPE &operator()( size_t i ) ATTRIBUTE_INLINE
     {
-        CHECK_ARRAY_INDEX3D( d_N, i, 0, 0 ) return d_data[i];
+        return d_data[d_size.index( i )];
     }
 
     /*!
      * Access the desired element
      * @param i             The row index
      */
-    HOST_DEVICE inline const TYPE &operator()( size_t i ) const
+    HOST_DEVICE inline const TYPE &operator()( size_t i ) const ATTRIBUTE_INLINE
     {
-        CHECK_ARRAY_INDEX3D( d_N, i, 0, 0 ) return d_data[i];
+        return d_data[d_size.index( i )];
     }
 
     /*!
@@ -393,9 +668,9 @@ public:
      * @param i             The row index
      * @param j             The column index
      */
-    HOST_DEVICE inline TYPE &operator()( size_t i, size_t j )
+    HOST_DEVICE inline TYPE &operator()( size_t i, size_t j ) ATTRIBUTE_INLINE
     {
-        CHECK_ARRAY_INDEX3D( d_N, i, j, 0 ) return d_data[i + j * d_N[0]];
+        return d_data[d_size.index( i, j )];
     }
 
     /*!
@@ -403,9 +678,9 @@ public:
      * @param i             The row index
      * @param j             The column index
      */
-    HOST_DEVICE inline const TYPE &operator()( size_t i, size_t j ) const
+    HOST_DEVICE inline const TYPE &operator()( size_t i, size_t j ) const ATTRIBUTE_INLINE
     {
-        CHECK_ARRAY_INDEX3D( d_N, i, j, 0 ) return d_data[i + j * d_N[0]];
+        return d_data[d_size.index( i, j )];
     }
 
     /*!
@@ -414,9 +689,9 @@ public:
      * @param j             The column index
      * @param k             The third index
      */
-    HOST_DEVICE inline TYPE &operator()( size_t i, size_t j, size_t k )
+    HOST_DEVICE inline TYPE &operator()( size_t i, size_t j, size_t k ) ATTRIBUTE_INLINE
     {
-        CHECK_ARRAY_INDEX3D( d_N, i, j, k ) return d_data[GET_ARRAY_INDEX3D( d_N, i, j, k )];
+        return d_data[d_size.index( i, j, k )];
     }
 
     /*!
@@ -425,35 +700,109 @@ public:
      * @param j             The column index
      * @param k             The third index
      */
-    HOST_DEVICE inline const TYPE &operator()( size_t i, size_t j, size_t k ) const
+    HOST_DEVICE inline const TYPE &operator()( size_t i, size_t j, size_t k ) const ATTRIBUTE_INLINE
     {
-        CHECK_ARRAY_INDEX3D( d_N, i, j, k ) return d_data[GET_ARRAY_INDEX3D( d_N, i, j, k )];
+        return d_data[d_size.index( i, j, k )];
     }
 
     /*!
      * Access the desired element
-     * @param i             The row index
-     * @param j             The column index
-     * @param k             The third index
-     * @param l             The fourth index
+     * @param i1            The first index
+     * @param i2            The second index
+     * @param i3            The third index
+     * @param i4            The fourth index
      */
-    HOST_DEVICE inline TYPE &operator()( size_t i, size_t j, size_t k, size_t l )
+    HOST_DEVICE inline TYPE &operator()(
+        size_t i1, size_t i2, size_t i3, size_t i4 ) ATTRIBUTE_INLINE
     {
-        CHECK_ARRAY_INDEX4D( d_N, i, j, k, l ) return d_data[GET_ARRAY_INDEX4D( d_N, i, j, k, l )];
+        return d_data[d_size.index( i1, i2, i3, i4 )];
     }
 
     /*!
      * Access the desired element
-     * @param i             The row index
-     * @param j             The column index
-     * @param k             The third index
-     * @param l             The fourth index
+     * @param i1            The first index
+     * @param i2            The second index
+     * @param i3            The third index
+     * @param i4            The fourth index
      */
-    HOST_DEVICE inline const TYPE &operator()( size_t i, size_t j, size_t k, size_t l ) const
+    HOST_DEVICE inline const TYPE &operator()(
+        size_t i1, size_t i2, size_t i3, size_t i4 ) const ATTRIBUTE_INLINE
     {
-        CHECK_ARRAY_INDEX4D( d_N, i, j, k, l ) return d_data[GET_ARRAY_INDEX4D( d_N, i, j, k, l )];
+        return d_data[d_size.index( i1, i2, i3, i4 )];
     }
 
+    /*!
+     * Access the desired element
+     * @param i1            The first index
+     * @param i2            The second index
+     * @param i3            The third index
+     * @param i4            The fourth index
+     * @param i5            The fifth index
+     */
+    HOST_DEVICE inline TYPE &operator()(
+        size_t i1, size_t i2, size_t i3, size_t i4, size_t i5 ) ATTRIBUTE_INLINE
+    {
+        return d_data[d_size.index( i1, i2, i3, i4, i5 )];
+    }
+
+    /*!
+     * Access the desired element
+     * @param i1            The first index
+     * @param i2            The second index
+     * @param i3            The third index
+     * @param i4            The fourth index
+     * @param i5            The fifth index
+     */
+    HOST_DEVICE inline const TYPE &operator()(
+        size_t i1, size_t i2, size_t i3, size_t i4, size_t i5 ) const ATTRIBUTE_INLINE
+    {
+        return d_data[d_size.index( i1, i2, i3, i4, i5 )];
+    }
+
+    /*!
+     * Access the desired element as a raw pointer
+     * @param i             The global index
+     */
+    HOST_DEVICE inline TYPE *ptr( size_t i ) ATTRIBUTE_INLINE
+    {
+        return i >= d_size.length() ? nullptr : &d_data[i];
+    }
+
+    /*!
+     * Access the desired element as a raw pointer
+     * @param i             The global index
+     */
+    HOST_DEVICE inline const TYPE *ptr( size_t i ) const ATTRIBUTE_INLINE
+    {
+        return i >= d_size.length() ? nullptr : &d_data[i];
+    }
+
+    //! Get iterator to beginning of data
+    inline TYPE *begin() ATTRIBUTE_INLINE { return d_data; }
+
+    //! Get iterator to beginning of data
+    inline const TYPE *begin() const ATTRIBUTE_INLINE { return d_data; }
+
+    //! Get iterator to end of data
+    inline TYPE *end() ATTRIBUTE_INLINE { return d_data + d_size.length(); }
+
+    //! Get iterator to end of data
+    inline const TYPE *end() const ATTRIBUTE_INLINE { return d_data + d_size.length(); }
+
+    //! Return the pointer to the raw data
+    inline std::shared_ptr<TYPE> getPtr() ATTRIBUTE_INLINE { return d_ptr; }
+
+    //! Return the pointer to the raw data
+    inline std::shared_ptr<const TYPE> getPtr() const ATTRIBUTE_INLINE { return d_ptr; }
+
+    //! Return the pointer to the raw data
+    HOST_DEVICE inline TYPE *data() ATTRIBUTE_INLINE { return d_data; }
+
+    //! Return the pointer to the raw data
+    HOST_DEVICE inline const TYPE *data() const ATTRIBUTE_INLINE { return d_data; }
+
+
+public: // Operator overloading
     //! Check if two matrices are equal
     // Equality means the dimensions and data have to be identical
     bool operator==( const Array &rhs ) const;
@@ -461,19 +810,28 @@ public:
     //! Check if two matrices are not equal
     inline bool operator!=( const Array &rhs ) const { return !this->operator==( rhs ); }
 
+    //! Add another array
+    Array &operator+=( const Array &rhs );
 
-    //! Return the pointer to the raw data
-    inline std::shared_ptr<TYPE> getPtr() { return d_ptr; }
+    //! Subtract another array
+    Array &operator-=( const Array &rhs );
 
-    //! Return the pointer to the raw data
-    inline std::shared_ptr<const TYPE> getPtr() const { return d_ptr; }
+    //! Add a scalar
+    Array &operator+=( const TYPE &rhs );
 
-    //! Return the pointer to the raw data
-    HOST_DEVICE inline TYPE *data() { return d_data; }
+    //! Subtract a scalar
+    Array &operator-=( const TYPE &rhs );
 
-    //! Return the pointer to the raw data
-    HOST_DEVICE inline const TYPE *data() const { return d_data; }
 
+public: // Math operations
+    //! Concatenates the arrays along the dimension dim.
+    static Array cat( const std::vector<Array> &x, int dim = 0 );
+
+    //! Concatenates a given array with the current array
+    void cat( const Array &x, int dim = 0 );
+
+    //! Initialize the array with random values (defined from the function table)
+    void rand();
 
     //! Return true if NaNs are present
     inline bool NaNs() const;
@@ -491,13 +849,13 @@ public:
     inline TYPE mean() const;
 
     //! Return the min of all elements in a given direction
-    Array<TYPE> min( int dir ) const;
+    Array<TYPE, FUN> min( int dir ) const;
 
     //! Return the max of all elements in a given direction
-    Array<TYPE> max( int dir ) const;
+    Array<TYPE, FUN> max( int dir ) const;
 
     //! Return the sum of all elements in a given direction
-    Array<TYPE> sum( int dir ) const;
+    Array<TYPE, FUN> sum( int dir ) const;
 
     //! Return the smallest value
     inline TYPE min( const std::vector<size_t> &index ) const;
@@ -511,52 +869,86 @@ public:
     //! Return the mean of all elements
     inline TYPE mean( const std::vector<size_t> &index ) const;
 
+    //! Return the smallest value
+    inline TYPE min( const std::vector<Range<size_t>> &index ) const;
+
+    //! Return the largest value
+    inline TYPE max( const std::vector<Range<size_t>> &index ) const;
+
+    //! Return the sum of all elements
+    inline TYPE sum( const std::vector<Range<size_t>> &index ) const;
+
+    //! Return the mean of all elements
+    inline TYPE mean( const std::vector<Range<size_t>> &index ) const;
+
     //! Find all elements that match the operator
     std::vector<size_t> find(
         const TYPE &value, std::function<bool( const TYPE &, const TYPE & )> compare ) const;
 
-    //! Add another array
-    Array &operator+=( const Array &rhs );
-
-    //! Subtract another array
-    Array &operator-=( const Array &rhs );
-
-    //! Add a scalar
-    Array &operator+=( const TYPE &rhs );
-
-    //! Subtract a scalar
-    Array &operator-=( const TYPE &rhs );
 
     //! Print an array
-    void print( std::ostream& os, const std::string& name="A", const std::string& prefix="" ) const;
+    void print(
+        std::ostream &os, const std::string &name = "A", const std::string &prefix = "" ) const;
 
     //! Multiply two arrays
-    static Array multiply( const Array& a, const Array& b );
+    static Array multiply( const Array &a, const Array &b );
 
     //! Transpose an array
-    Array<TYPE> reverseDim( ) const;
+    Array<TYPE, FUN> reverseDim() const;
+
+    //! Replicate an array a given number of times in each direction
+    Array<TYPE, FUN> repmat( const std::vector<size_t> &N ) const;
 
     //! Coarsen an array using the given filter
-    Array<TYPE> coarsen( const Array<TYPE>& filter ) const;
+    Array<TYPE, FUN> coarsen( const Array<TYPE, FUN> &filter ) const;
 
     //! Coarsen an array using the given filter
-    Array<TYPE> coarsen( const std::vector<size_t>& ratio, std::function<TYPE(const Array<TYPE>&)> filter ) const;
+    Array<TYPE, FUN> coarsen( const std::vector<size_t> &ratio,
+        std::function<TYPE( const Array<TYPE, FUN> & )> filter ) const;
+
+    /*!
+     * Perform a element-wise operation y = f(x)
+     * @param[in] fun           The function operation
+     * @param[in] x             The input array
+     */
+    static Array transform( std::function<TYPE( const TYPE & )> fun, const Array &x );
+
+    /*!
+     * Perform a element-wise operation z = f(x,y)
+     * @param[in] fun           The function operation
+     * @param[in] x             The first array
+     * @param[in] y             The second array
+     */
+    static Array transform(
+        std::function<TYPE( const TYPE &, const TYPE & )> fun, const Array &x, const Array &y );
+
+    /*!
+     * axpby operation: this = alpha*x + beta*this
+     * @param[in] alpha         alpha
+     * @param[in] x             x
+     * @param[in] beta          beta
+     */
+    void axpby( const TYPE &alpha, const Array<TYPE, FUN> &x, const TYPE &beta );
 
 private:
-    int d_ndim;                  // Number of dimensions in array
-    size_t d_N[ARRAY_NDIM_MAX];  // Size of each dimension
-    size_t d_length;             // Total length of array
+    ArraySize d_size;            // Size of each dimension
     TYPE *d_data;                // Raw pointer to data in array
     std::shared_ptr<TYPE> d_ptr; // Shared pointer to data in array
-    void allocate( const std::vector<size_t> &N );
+    void allocate( const ArraySize &N );
+
+public:
+    template<class TYPE2, class FUN2>
+    inline bool sizeMatch( const Array<TYPE2, FUN2> &rhs ) const
+    {
+        return d_size == rhs.d_size;
+    }
 
 private:
-    template<class TYPE2>
-    inline bool sizeMatch( const Array<TYPE2>& rhs ) const;
-    inline void checkSubsetIndex( const std::vector<size_t> &index ) const;
-    inline std::array<size_t, 5> getDimArray() const;
-    static inline void getSubsetArrays( const std::vector<size_t> &index,
-        std::array<size_t, 5> &first, std::array<size_t, 5> &last, std::array<size_t, 5> &N );
+    inline void checkSubsetIndex( const std::vector<Range<size_t>> &range ) const;
+    inline std::vector<Range<size_t>> convert( const std::vector<size_t> &index ) const;
+    static inline void getSubsetArrays( const std::vector<Range<size_t>> &range,
+        std::array<size_t, 5> &first, std::array<size_t, 5> &last, std::array<size_t, 5> &inc,
+        std::array<size_t, 5> &N );
 };
 
 
diff --git a/common/Array.hpp b/common/Array.hpp
index aa06cc2c..b91e46b4 100644
--- a/common/Array.hpp
+++ b/common/Array.hpp
@@ -2,267 +2,396 @@
 #define included_ArrayClass_hpp
 
 #include "common/Array.h"
+#include "common/FunctionTable.h"
 #include "common/Utilities.h"
 #include <algorithm>
+#include <cmath>
 #include <cstring>
 #include <limits>
-#include <stdexcept>
-
 
 
 /********************************************************
-*  Constructors                                         *
-********************************************************/
-template <class TYPE>
-Array<TYPE>::Array()
+ *  ArraySize                                            *
+ ********************************************************/
+inline ArraySize::ArraySize() // Default state: one dimension of extent 0, total length 0
 {
     d_ndim   = 1;
+    d_N[0]   = 0; // the single dimension is empty
+    d_N[1]   = 1; // every other slot holds 1
+    d_N[2]   = 1;
+    d_N[3]   = 1;
+    d_N[4]   = 1;
     d_length = 0;
-    for ( size_t i = 0; i < ARRAY_NDIM_MAX; i++ )
-        d_N[i]     = 1;
-    d_N[0]         = 0;
-    d_data         = nullptr;
 }
-template <class TYPE>
-Array<TYPE>::Array( size_t N )
+inline ArraySize::ArraySize( size_t N1 ) // 1D size with extent N1
 {
-    allocate( std::vector<size_t>( 1, N ) );
+    d_ndim   = 1;
+    d_N[0]   = N1;
+    d_N[1]   = 1; // slots beyond d_ndim are stored as 1
+    d_N[2]   = 1;
+    d_N[3]   = 1;
+    d_N[4]   = 1;
+    d_length = N1; // total number of elements
 }
-template <class TYPE>
-Array<TYPE>::Array( size_t N_rows, size_t N_columns )
+inline ArraySize::ArraySize( size_t N1, size_t N2 ) // 2D size with extents N1 x N2
 {
-    std::vector<size_t> N( 2 );
-    N[0] = N_rows;
-    N[1] = N_columns;
-    allocate( N );
+    d_ndim   = 2;
+    d_N[0]   = N1;
+    d_N[1]   = N2;
+    d_N[2]   = 1; // slots beyond d_ndim are stored as 1
+    d_N[3]   = 1;
+    d_N[4]   = 1;
+    d_length = N1 * N2; // total number of elements
 }
-template <class TYPE>
-Array<TYPE>::Array( size_t N1, size_t N2, size_t N3 )
+inline ArraySize::ArraySize( size_t N1, size_t N2, size_t N3 ) // 3D size N1 x N2 x N3
 {
-    std::vector<size_t> N( 3 );
-    N[0] = N1;
-    N[1] = N2;
-    N[2] = N3;
-    allocate( N );
+    d_ndim   = 3;
+    d_N[0]   = N1;
+    d_N[1]   = N2;
+    d_N[2]   = N3;
+    d_N[3]   = 1; // slots beyond d_ndim are stored as 1
+    d_N[4]   = 1;
+    d_length = N1 * N2 * N3; // total number of elements
 }
-template <class TYPE>
-Array<TYPE>::Array( const std::vector<size_t> &N, const TYPE *data )
+inline ArraySize::ArraySize( size_t N1, size_t N2, size_t N3, size_t N4 ) // 4D size
 {
-    allocate( N );
-    if ( data != NULL ) {
-        for ( size_t i = 0; i < d_length; i++ )
-            d_data[i]  = data[i];
-    }
+    d_ndim   = 4;
+    d_N[0]   = N1;
+    d_N[1]   = N2;
+    d_N[2]   = N3;
+    d_N[3]   = N4;
+    d_N[4]   = 1; // slot beyond d_ndim is stored as 1
+    d_length = N1 * N2 * N3 * N4; // total number of elements
 }
-template <class TYPE>
-void Array<TYPE>::allocate( const std::vector<size_t> &N )
+inline ArraySize::ArraySize( size_t N1, size_t N2, size_t N3, size_t N4, size_t N5 )
 {
-    d_ndim   = static_cast<int>( N.size() );
+    d_ndim   = 5; // all five extent slots are used
+    d_N[0]   = N1;
+    d_N[1]   = N2;
+    d_N[2]   = N3;
+    d_N[3]   = N4;
+    d_N[4]   = N5;
+    d_length = N1 * N2 * N3 * N4 * N5; // total number of elements
+}
+inline ArraySize::ArraySize( std::initializer_list<size_t> N )
+{
+    if ( N.size() > maxDim() ) // reject before d_N is written past its last element
+        throw std::out_of_range( "Maximum number of dimensions exceeded" );
+    d_ndim  = N.size();
+    d_N[0]  = 0; // stays 0 when N is empty
+    for ( size_t i = 1; i < maxDim(); i++ )
+        d_N[i] = 1; // slots beyond d_ndim are stored as 1
+    auto it = N.begin();
+    for ( size_t i = 0; i < d_ndim; i++, ++it )
+        d_N[i] = *it;
     d_length = 1;
-    for ( size_t i = 0; i < ARRAY_NDIM_MAX; i++ )
-        d_N[i]     = 1;
-    for ( size_t i = 0; i < N.size(); i++ ) {
-        d_N[i] = N[i];
-        d_length *= N[i];
-    }
-    if ( N.empty() ) {
-        d_N[0]   = 0;
+    for ( size_t i = 0; i < maxDim(); i++ ) // length is the product of all extent slots
+        d_length *= d_N[i];
+    if ( d_ndim == 0 ) // an empty list describes an empty array
         d_length = 0;
+}
+inline ArraySize::ArraySize( size_t ndim, const size_t *dims )
+{
+    if ( ndim > maxDim() ) // reject before d_N is written past its last element
+        throw std::out_of_range( "Maximum number of dimensions exceeded" );
+    d_ndim = ndim;
+    d_N[0] = 0; // stays 0 when ndim == 0
+    for ( size_t i = 1; i < maxDim(); i++ )
+        d_N[i] = 1; // slots beyond d_ndim are stored as 1
+    for ( size_t i = 0; i < ndim; i++ ) // dims must provide at least ndim entries
+        d_N[i] = dims[i];
+    d_length = 1;
+    for ( size_t i = 0; i < maxDim(); i++ ) // length is the product of all extent slots
+        d_length *= d_N[i];
+    if ( d_ndim == 0 ) // zero dimensions describe an empty array
+        d_length = 0;
+}
+inline ArraySize::ArraySize( const std::vector<size_t> &N )
+{
+    if ( N.size() > maxDim() ) // reject before d_N is written past its last element
+        throw std::out_of_range( "Maximum number of dimensions exceeded" );
+    d_ndim = N.size();
+    d_N[0] = 0; // stays 0 when N is empty
+    for ( size_t i = 1; i < maxDim(); i++ )
+        d_N[i] = 1; // slots beyond d_ndim are stored as 1
+    for ( size_t i = 0; i < d_ndim; i++ )
+        d_N[i] = N[i];
+    d_length = 1;
+    for ( size_t i = 0; i < maxDim(); i++ ) // length is the product of all extent slots
+        d_length *= d_N[i];
+    if ( d_ndim == 0 ) // an empty vector describes an empty array
+        d_length = 0;
+}
+inline ArraySize::ArraySize( const ArraySize &rhs ) { memcpy( this, &rhs, sizeof( *this ) ); } // bytewise copy of the whole object
+inline ArraySize::ArraySize( ArraySize &&rhs ) { memcpy( this, &rhs, sizeof( *this ) ); } // same bytewise copy; rhs is not modified
+inline ArraySize &ArraySize::operator=( const ArraySize &rhs )
+{
+    if ( this != &rhs ) // self-assignment: nothing to copy
+        memcpy( this, &rhs, sizeof( *this ) ); // bytewise copy of the whole object
+    return *this;
+}
+inline ArraySize &ArraySize::operator=( ArraySize &&rhs )
+{
+    if ( this != &rhs ) // self-assignment: nothing to copy
+        memcpy( this, &rhs, sizeof( *this ) ); // same bytewise copy as copy-assignment; rhs is not modified
+    return *this;
+}
+inline void ArraySize::resize( uint8_t dim, size_t N ) // Set the extent of one existing dimension to N
+{
+    if ( dim >= d_ndim ) // only dimensions 0 .. d_ndim-1 may be changed
+        throw std::out_of_range( "Invalid dimension" );
+    d_N[dim] = N;
+    d_length = 1; // recompute the total length from all extent slots
+    for ( size_t i = 0; i < maxDim(); i++ )
+        d_length *= d_N[i];
+}
+
+
+/********************************************************
+ *  Constructors                                         *
+ ********************************************************/
+template<class TYPE, class FUN>
+Array<TYPE, FUN>::Array() // Empty array: d_size is default-constructed and nothing is allocated
+{
+    d_data = nullptr;
+}
+template<class TYPE, class FUN>
+Array<TYPE, FUN>::Array( const ArraySize &N ) // Allocate storage for an array of size N
+{
+    allocate( N ); // elements are not explicitly assigned here
+}
+template<class TYPE, class FUN>
+Array<TYPE, FUN>::Array( size_t N ) // Allocate a 1D array with N elements
+{
+    allocate( ArraySize( N ) );
+}
+template<class TYPE, class FUN>
+Array<TYPE, FUN>::Array( size_t N_rows, size_t N_cols ) // Allocate a 2D array (N_rows x N_cols)
+{
+    allocate( ArraySize( N_rows, N_cols ) );
+}
+template<class TYPE, class FUN>
+Array<TYPE, FUN>::Array( size_t N1, size_t N2, size_t N3 ) // Allocate a 3D array (N1 x N2 x N3)
+{
+    allocate( ArraySize( N1, N2, N3 ) );
+}
+template<class TYPE, class FUN>
+Array<TYPE, FUN>::Array( size_t N1, size_t N2, size_t N3, size_t N4 ) // Allocate a 4D array
+{
+    allocate( ArraySize( N1, N2, N3, N4 ) );
+}
+template<class TYPE, class FUN>
+Array<TYPE, FUN>::Array( size_t N1, size_t N2, size_t N3, size_t N4, size_t N5 ) // 5D array
+{
+    allocate( ArraySize( N1, N2, N3, N4, N5 ) );
+}
+template<class TYPE, class FUN>
+Array<TYPE, FUN>::Array( const std::vector<size_t> &N, const TYPE *data )
+{
+    allocate( N ); // N is converted to an ArraySize
+    if ( data ) { // optional initial values; length() entries are read from data
+        for ( size_t i = 0; i < d_size.length(); i++ )
+            d_data[i] = data[i];
     }
-    if ( d_length == 0 )
+}
+template<class TYPE, class FUN>
+Array<TYPE, FUN>::Array( const Range<TYPE> &range ) // Fill a 1D array with the values i:k:j
+{
+    double tmp = static_cast<double>( ( range.j - range.i ) ) / static_cast<double>( range.k );
+    size_t N   = static_cast<size_t>( floor( tmp + 1e-12 ) + 1 ); // number of points, j inclusive
+    allocate( { N } );
+    for ( size_t i = 0; i < N; i++ ) // integral TYPE: exact i + n*k (i/k would truncate); otherwise keep k*(i/k + n)
+        d_data[i] = std::is_integral<TYPE>::value ? TYPE( range.i + TYPE( i ) * range.k ) : TYPE( range.k * ( range.i / range.k + i ) );
+}
+template<class TYPE, class FUN>
+Array<TYPE, FUN>::Array( std::initializer_list<TYPE> x ) // 1D array holding the listed values
+{
+    allocate( { x.size() } ); // one dimension with x.size() elements
+    auto it = x.begin();
+    for ( size_t i = 0; i < x.size(); ++i, ++it )
+        d_data[i] = *it; // copy the values in list order
+}
+template<class TYPE, class FUN>
+void Array<TYPE, FUN>::allocate( const ArraySize &N ) // Set the size to N and replace the storage
+{
+    d_size      = N;
+    auto length = d_size.length();
+    if ( length == 0 ) // empty array: release the storage, d_data becomes nullptr below
         d_ptr.reset();
     else
-        d_ptr.reset( new ( std::nothrow ) TYPE[d_length], []( TYPE *p ) { delete[] p; } );
+        d_ptr.reset( new ( std::nothrow ) TYPE[length], []( TYPE *p ) { delete[] p; } ); // nothrow: failure yields nullptr
     d_data = d_ptr.get();
-    if ( d_length > 0 && d_data == nullptr )
+    if ( length > 0 && d_data == nullptr ) // report the failed nothrow allocation
         throw std::logic_error( "Failed to allocate array" );
 }
-template <class TYPE>
-Array<TYPE>::Array( const Array &rhs )
-    : d_ndim( rhs.d_ndim ), d_length( rhs.d_length ), d_data( nullptr )
+template<class TYPE, class FUN> // Copy constructor: new storage, then element-wise copy
+Array<TYPE, FUN>::Array( const Array &rhs ) : d_size( rhs.d_size ), d_data( nullptr )
 {
     allocate( rhs.size() );
-    for ( size_t i = 0; i < d_length; i++ )
-        d_data[i]  = rhs.d_data[i];
+    for ( size_t i = 0; i < d_size.length(); i++ )
+        d_data[i] = rhs.d_data[i]; // the copy does not share rhs's buffer
 }
-template <class TYPE>
-Array<TYPE>::Array( Array &&rhs )
-    : d_ndim( rhs.d_ndim ), d_length( rhs.d_length ), d_data( rhs.d_data )
+template<class TYPE, class FUN> // Move constructor: take over rhs's size, raw pointer and shared pointer
+Array<TYPE, FUN>::Array( Array &&rhs ) : d_size( rhs.d_size ), d_data( rhs.d_data )
 {
-    rhs.d_ndim = 0;
-    memcpy( d_N, rhs.d_N, sizeof( rhs.d_N ) );
-    memset( rhs.d_N, 0, sizeof( rhs.d_N ) );
-    rhs.d_length = 0;
-    rhs.d_data   = nullptr;
-    d_ptr        = std::move( rhs.d_ptr );
+    rhs.d_size = ArraySize(); // rhs is left with the default (empty) size
+    rhs.d_data = nullptr;
+    d_ptr      = std::move( rhs.d_ptr ); // ownership of the buffer moves to this object
 }
-template <class TYPE>
-Array<TYPE> &Array<TYPE>::operator=( const Array &rhs )
+template<class TYPE, class FUN> // Copy assignment: reallocate to rhs's size, then copy element-wise
+Array<TYPE, FUN> &Array<TYPE, FUN>::operator=( const Array &rhs )
 {
     if ( this == &rhs )
         return *this;
     this->allocate( rhs.size() );
-    for ( size_t i      = 0; i < d_length; i++ )
+    for ( size_t i = 0; i < d_size.length(); i++ ) // d_size was set to rhs's size by allocate
         this->d_data[i] = rhs.d_data[i];
     return *this;
 }
-template <class TYPE>
-Array<TYPE> &Array<TYPE>::operator=( Array &&rhs )
+template<class TYPE, class FUN> // Move assignment: take over rhs's size and storage
+Array<TYPE, FUN> &Array<TYPE, FUN>::operator=( Array &&rhs )
 {
     if ( this == &rhs )
         return *this;
-    d_ndim     = rhs.d_ndim;
-    rhs.d_ndim = 0;
-    memcpy( d_N, rhs.d_N, sizeof( rhs.d_N ) );
-    memset( rhs.d_N, 0, sizeof( rhs.d_N ) );
-    d_length     = rhs.d_length;
-    rhs.d_length = 0;
-    d_data       = rhs.d_data;
-    rhs.d_data   = nullptr;
-    d_ptr        = std::move( rhs.d_ptr );
+    d_size     = rhs.d_size;
+    rhs.d_size = ArraySize(); // rhs is left with the default (empty) size
+    d_data     = rhs.d_data;
+    rhs.d_data = nullptr;
+    d_ptr      = std::move( rhs.d_ptr ); // replaces the shared pointer previously held by this object
     return *this;
 }
-template <class TYPE>
-Array<TYPE> &Array<TYPE>::operator=( const std::vector<TYPE> &rhs )
+template<class TYPE, class FUN> // Assign from a std::vector: the result is 1D with rhs.size() elements
+Array<TYPE, FUN> &Array<TYPE, FUN>::operator=( const std::vector<TYPE> &rhs )
 {
-    this->allocate( std::vector<size_t>( 1, rhs.size() ) );
-    for ( size_t i      = 0; i < rhs.size(); i++ )
+    this->allocate( ArraySize( rhs.size() ) ); // replaces any existing storage
+    for ( size_t i = 0; i < rhs.size(); i++ )
         this->d_data[i] = rhs[i];
     return *this;
 }
-template <class TYPE>
-Array<TYPE>::~Array()
+template<class TYPE, class FUN>
+Array<TYPE, FUN>::~Array() // empty body: no explicit cleanup is performed here
 {
 }
-template <class TYPE>
-void Array<TYPE>::clear()
+template<class TYPE, class FUN>
+void Array<TYPE, FUN>::clear() // Reset to an empty array and drop this object's reference to the storage
 {
-    d_ndim   = 0;
-    d_length = 0;
-    for ( size_t i = 0; i < ARRAY_NDIM_MAX; i++ )
-        d_N[i]     = 1;
-    d_N[0]         = 0;
+    d_size = ArraySize(); // default size has length 0
     d_ptr.reset();
     d_data = nullptr;
 }
 
 
 /********************************************************
-*  Check if the size of the array matches rhs           *
-********************************************************/
-template <class TYPE>
-template <class TYPE2>
-bool Array<TYPE>::sizeMatch( const Array<TYPE2>& rhs ) const
-{
-    bool test = d_ndim == rhs.d_ndim;
-    for ( int d = 0; d < d_ndim; d++ )
-        test = test && d_N[d] == rhs.d_N[d];
-    return test;
-}
+ *  Access elements                                      *
+ ********************************************************/
 
 
 /********************************************************
-*  Resize the array                                     *
-********************************************************/
-template <class TYPE>
-void Array<TYPE>::resize( size_t N )
+ *  Copy/move values from one array to another (resize)  *
+ ********************************************************/
+template<class TYPE> // Move the elements in the index range common to N1 and N2 from data1 to data2
+inline void moveValues( const ArraySize &N1, const ArraySize &N2, TYPE *data1, TYPE *data2 )
 {
-    resize( std::vector<size_t>{N} );
-}
-template <class TYPE>
-void Array<TYPE>::resize( size_t N1, size_t N2 )
-{
-    resize( std::vector<size_t>{N1,N2} );
-}
-template <class TYPE>
-void Array<TYPE>::resize( size_t N1, size_t N2, size_t N3 )
-{
-    resize( std::vector<size_t>{N1,N2,N3} );
-}
-template <class TYPE>
-void Array<TYPE>::resize( const std::vector<size_t> &N )
-{
-    // Check if the array actually changed size
-    size_t new_length = 1;
-    for ( size_t i = 0; i < N.size(); i++ )
-        new_length *= N[i];
-    if ( N.empty() )
-        new_length = 0;
-    bool changed = new_length != d_length || (int) N.size() != d_ndim;
-    for ( size_t i = 0; i < N.size(); i++ )
-        changed = changed || N[i] != d_N[i];
-    if ( !changed )
-        return;
-// Store the old data
-#if ARRAY_NDIM_MAX > 5
-#error Function programmed for more than 5 dimensions
-#endif
-    std::array<size_t, 5> N1{ { 1, 1, 1, 1, 1 } };
-    std::array<size_t, 5> N2{ { 1, 1, 1, 1, 1 } };
-    for ( int d = 0; d < d_ndim; d++ )
-        N1[d]   = d_N[d];
-    for ( size_t d = 0; d < N.size(); d++ )
-        N2[d]      = N[d];
-    if ( d_ndim == 0 ) {
-        N1[0] = 0;
-    }
-    if ( N.empty() ) {
-        N2[0] = 0;
-    }
-    std::shared_ptr<TYPE> old_data = d_ptr;
-    // Allocate new data
-    allocate( N );
-    // Copy the old values
-    if ( d_length > 0 ) {
-        TYPE *data1 = old_data.get();
-        TYPE *data2 = d_data;
-        if ( old_data.unique() ) {
-            // We own the data, use std:move
-            for ( size_t i5 = 0; i5 < std::min( N1[4], N2[4] ); i5++ ) {
-                for ( size_t i4 = 0; i4 < std::min( N1[3], N2[3] ); i4++ ) {
-                    for ( size_t i3 = 0; i3 < std::min( N1[2], N2[2] ); i3++ ) {
-                        for ( size_t i2 = 0; i2 < std::min( N1[1], N2[1] ); i2++ ) {
-                            for ( size_t i1 = 0; i1 < std::min( N1[0], N2[0] ); i1++ ) {
-                                size_t index1 = GET_ARRAY_INDEX5D( N1, i1, i2, i3, i4, i5 );
-                                size_t index2 = GET_ARRAY_INDEX5D( N2, i1, i2, i3, i4, i5 );
-                                data2[index2] = std::move( data1[index1] );
-                            }
-                        }
-                    }
-                }
-            }
-        } else {
-            // We do not own the data, copy
-            for ( size_t i5 = 0; i5 < std::min( N1[4], N2[4] ); i5++ ) {
-                for ( size_t i4 = 0; i4 < std::min( N1[3], N2[3] ); i4++ ) {
-                    for ( size_t i3 = 0; i3 < std::min( N1[2], N2[2] ); i3++ ) {
-                        for ( size_t i2 = 0; i2 < std::min( N1[1], N2[1] ); i2++ ) {
-                            for ( size_t i1 = 0; i1 < std::min( N1[0], N2[0] ); i1++ ) {
-                                size_t index1 = GET_ARRAY_INDEX5D( N1, i1, i2, i3, i4, i5 );
-                                size_t index2 = GET_ARRAY_INDEX5D( N2, i1, i2, i3, i4, i5 );
-                                data2[index2] = data1[index1];
-                            }
-                        }
-                    }
-                }
-            }
-        } else {
-            // We do not own the data, copy
-            for ( size_t i5 = 0; i5 < std::min( N1[4], N2[4] ); i5++ ) {
-                for ( size_t i4 = 0; i4 < std::min( N1[3], N2[3] ); i4++ ) {
-                    for ( size_t i3 = 0; i3 < std::min( N1[2], N2[2] ); i3++ ) {
-                        for ( size_t i2 = 0; i2 < std::min( N1[1], N2[1] ); i2++ ) {
-                            for ( size_t i1 = 0; i1 < std::min( N1[0], N2[0] ); i1++ ) {
-                                size_t index1 = GET_ARRAY_INDEX5D( N1, i1, i2, i3, i4, i5 );
-                                size_t index2 = GET_ARRAY_INDEX5D( N2, i1, i2, i3, i4, i5 );
-                                data2[index2] = data1[index1];
-                            }
-                        }
-                    }
+    for ( size_t i5 = 0; i5 < std::min( N1[4], N2[4] ); i5++ ) { // each loop stops at the smaller extent
+        for ( size_t i4 = 0; i4 < std::min( N1[3], N2[3] ); i4++ ) {
+            for ( size_t i3 = 0; i3 < std::min( N1[2], N2[2] ); i3++ ) {
+                for ( size_t i2 = 0; i2 < std::min( N1[1], N2[1] ); i2++ ) {
+                    for ( size_t i1 = 0; i1 < std::min( N1[0], N2[0] ); i1++ ) {
+                        size_t index1 = N1.index( i1, i2, i3, i4, i5 ); // offset in the source layout
+                        size_t index2 = N2.index( i1, i2, i3, i4, i5 ); // offset in the destination layout
+                        data2[index2] = std::move( data1[index1] );
                     }
                 }
             }
         }
     }
 }
-template <class TYPE>
-void Array<TYPE>::resizeDim( int dim, size_t N, const TYPE &value )
+template<bool test, class TYPE> // Overload enabled when test is true: copy the index range common to N1 and N2
+inline typename std::enable_if<test, void>::type copyValues(
+    const ArraySize &N1, const ArraySize &N2, const TYPE *data1, TYPE *data2 )
 {
-    if ( dim >= d_ndim )
-        throw std::logic_error( "Invalid dimension" );
-    std::vector<size_t> N2 = size();
-    size_t N0              = N2[dim];
-    N2[dim]                = N;
-    resize( N2 );
+    for ( size_t i5 = 0; i5 < std::min( N1[4], N2[4] ); i5++ ) { // each loop stops at the smaller extent
+        for ( size_t i4 = 0; i4 < std::min( N1[3], N2[3] ); i4++ ) {
+            for ( size_t i3 = 0; i3 < std::min( N1[2], N2[2] ); i3++ ) {
+                for ( size_t i2 = 0; i2 < std::min( N1[1], N2[1] ); i2++ ) {
+                    for ( size_t i1 = 0; i1 < std::min( N1[0], N2[0] ); i1++ ) {
+                        size_t index1 = N1.index( i1, i2, i3, i4, i5 ); // offset in the source layout
+                        size_t index2 = N2.index( i1, i2, i3, i4, i5 ); // offset in the destination layout
+                        data2[index2] = data1[index1]; // copy-assign; the source is left intact
+                    }
+                }
+            }
+        }
+    }
+}
+template<bool test, class TYPE> // Overload enabled when test is false
+inline typename std::enable_if<!test, void>::type copyValues(
+    const ArraySize &, const ArraySize &, const TYPE *, TYPE * )
+{
+    throw std::logic_error( "No copy constructor" ); // always throws; the arguments are ignored
+}
+
+
+/********************************************************
+ *  Resize the array                                     *
+ ********************************************************/
+template<class TYPE, class FUN>
+void Array<TYPE, FUN>::resize( size_t N ) // Resize to a 1D array with N elements
+{
+    resize( ArraySize( N ) ); // forwards to resize( const ArraySize & )
+}
+template<class TYPE, class FUN>
+void Array<TYPE, FUN>::resize( size_t N1, size_t N2 ) // Resize to a 2D array (N1 x N2)
+{
+    resize( ArraySize( N1, N2 ) ); // forwards to resize( const ArraySize & )
+}
+template<class TYPE, class FUN>
+void Array<TYPE, FUN>::resize( size_t N1, size_t N2, size_t N3 ) // Resize to a 3D array
+{
+    resize( ArraySize( N1, N2, N3 ) ); // forwards to resize( const ArraySize & )
+}
+
+template<class TYPE, class FUN>
+void Array<TYPE, FUN>::resize( const ArraySize &N )
+{
+    // Check if the array actually changed size
+    bool equal = true;
+    for ( size_t i = 0; i < ArraySize::maxDim(); i++ )
+        equal = equal && N[i] == d_size[i];
+    if ( equal ) {
+        d_size = N; // all extents match; the storage is kept as is
+        return;
+    }
+    // Store the old data
+    auto N0    = d_size;
+    auto data0 = d_ptr;
+    // Allocate new data
+    allocate( N );
+    // Copy the old values (nothing to transfer if the old or the new array is empty)
+    if ( N.length() > 0 && N0.length() > 0 ) {
+        if ( data0.use_count() <= 1 ) {
+            // We own the data, use std::move
+            moveValues( N0, N, data0.get(), d_data );
+        } else {
+            // We do not own the data, copy
+            copyValues<std::is_copy_constructible<TYPE>::value, TYPE>( N0, N, data0.get(), d_data );
+        }
+    }
+}
+template<class TYPE, class FUN>
+void Array<TYPE, FUN>::resizeDim( int dim, size_t N, const TYPE &value )
+{
+    if ( dim < 0 || dim >= d_size.ndim() ) // valid dimensions are 0 .. ndim()-1
+        throw std::out_of_range( "Invalid dimension" );
+    size_t N0 = d_size[dim]; // old extent; entries from N0 up to N are the new ones
+    auto size = d_size;
+    size.resize( dim, N );
+    resize( size );
     size_t n1 = 1, n2 = 1;
     for ( int d = 0; d < dim; d++ )
-        n1 *= N2[d];
-    for ( size_t d = dim + 1; d < N2.size(); d++ )
-        n2 *= N2[d];
+        n1 *= size[d]; // product of the extents below dim
+    for ( size_t d = dim + 1; d < size.ndim(); d++ )
+        n2 *= size[d]; // product of the extents above dim
     for ( size_t k = 0; k < n2; k++ ) {
         for ( size_t j = N0; j < N; j++ ) {
             for ( size_t i = 0; i < n1; i++ ) {
@@ -274,129 +403,112 @@ void Array<TYPE>::resizeDim( int dim, size_t N, const TYPE &value )
 
 
 /********************************************************
-*  Reshape the array                                     *
-********************************************************/
-template <class TYPE>
-void Array<TYPE>::reshape( const std::vector<size_t> &N )
+ *  Reshape the array                                     *
+ ********************************************************/
+template<class TYPE, class FUN>
+void Array<TYPE, FUN>::reshape( const ArraySize &N ) // Change the dimensions without touching the data
 {
-    size_t new_length = 1;
-    for ( size_t i = 0; i < N.size(); i++ )
-        new_length *= N[i];
-    if ( new_length != d_length )
+    if ( N.length() != d_size.length() ) // the total number of elements must stay the same
         throw std::logic_error( "reshape is not allowed to change the array size" );
-    d_ndim = N.size();
-    for ( size_t i = 0; i < ARRAY_NDIM_MAX; i++ )
-        d_N[i]     = 1;
-    for ( size_t i = 0; i < N.size(); i++ )
-        d_N[i]     = N[i];
+    d_size = N; // only the stored size changes
 }
 
 
 /********************************************************
-*  Subset the array                                     *
-********************************************************/
-// clang-format off
+ *  Subset the array                                     *
+ ********************************************************/
 // Helper function to check subset indices
-template <class TYPE>
-inline void Array<TYPE>::checkSubsetIndex( const std::vector<size_t> &index ) const
+template<class TYPE, class FUN>
+inline void Array<TYPE, FUN>::checkSubsetIndex( const std::vector<Range<size_t>> &range ) const
 {
-    bool test = index.size() % 2 == 0 && (int) index.size() / 2 <= d_ndim;
-    for ( size_t d = 0; d < index.size() / 2; d++ )
-        test = test && index[2 * d + 0] < d_N[d] && index[2 * d + 1] < d_N[d];
+    bool test = (int) range.size() == d_size.ndim(); // exactly one range per dimension
+    for ( size_t d = 0; d < range.size(); d++ ) // j is inclusive in the subset loops, so it must be a valid index
+        test = test && range[d].j < d_size[d];
     if ( !test )
         throw std::logic_error( "indices for subset are invalid" );
 }
-// Helper function to return dimensions as a std::array for hard coded loops
-template <class TYPE>
-inline std::array<size_t,5> Array<TYPE>::getDimArray() const
+template<class TYPE, class FUN> // Convert a flat list of (first,last) pairs into one Range per dimension
+inline std::vector<Range<size_t>> Array<TYPE, FUN>::convert(
+    const std::vector<size_t> &index ) const
 {
-    #if ARRAY_NDIM_MAX > 5
-        #error Function programmed for more than 5 dimensions
-    #endif
-    std::array<size_t,5> N{ { 1, 1, 1, 1, 1 } };
-    for ( int d = 0; d < d_ndim; d++ )
-        N[d] = d_N[d];
-    return N;
+    std::vector<Range<size_t>> range( d_size.ndim() );
+    if ( index.size() % 2 != 0 || static_cast<int>( index.size() / 2 ) < d_size.ndim() ) // need whole pairs, at least one per dimension
+        throw std::logic_error( "indices for subset are invalid" );
+    for ( int d = 0; d < d_size.ndim(); d++ ) // pairs beyond ndim() are not read
+        range[d] = Range<size_t>( index[2 * d + 0], index[2 * d + 1] );
+    return range;
 }
 // Helper function to return dimensions for the subset array
-template <class TYPE>
-inline void Array<TYPE>::getSubsetArrays( const std::vector<size_t> &index,
-                                          std::array<size_t, 5> &first,
-                                          std::array<size_t, 5> &last,
-                                          std::array<size_t, 5> &N )
+template<class TYPE, class FUN> // Unpack the ranges into fixed-size first/last/inc/N arrays for the 5-deep loops
+inline void Array<TYPE, FUN>::getSubsetArrays( const std::vector<Range<size_t>> &index,
+    std::array<size_t, 5> &first, std::array<size_t, 5> &last, std::array<size_t, 5> &inc,
+    std::array<size_t, 5> &N )
 {
-    #if ARRAY_NDIM_MAX > 5
-        #error Function programmed for more than 5 dimensions
-    #endif
-    size_t ndim = index.size() / 2;
+    first.fill( 0 ); // defaults for dimensions without a range: the single index 0
+    last.fill( 0 );
+    inc.fill( 1 );
+    N.fill( 1 );
+    size_t ndim = index.size();
     for ( size_t d = 0; d < ndim; d++ ) {
-        first[d] = index[2 * d + 0];
-        last[d]  = index[2 * d + 1];
-        N[d]     = last[d] - first[d] + 1;
-    }
-    for ( size_t d = ndim; d < 5; d++ ) {
-        first[d] = 0;
-        last[d]  = 0;
-        N[d]     = 1;
+        first[d] = index[d].i;
+        last[d]  = index[d].j;
+        inc[d]   = index[d].k;
+        N[d]     = ( last[d] - first[d] + inc[d] ) / inc[d]; // number of points in first:inc:last, last inclusive
     }
 }
-template <class TYPE>
-template <class TYPE2>
-Array<TYPE2> Array<TYPE>::subset( const std::vector<size_t> &index ) const
+template<class TYPE, class FUN>
+template<class TYPE2>
+Array<TYPE2, FUN> Array<TYPE, FUN>::subset( const std::vector<Range<size_t>> &index ) const
 {
     // Get the subset indicies
     checkSubsetIndex( index );
-    std::array<size_t,5> first, last, N1;
-    getSubsetArrays( index, first, last, N1 );
-    std::array<size_t,5> N2 = getDimArray();
+    std::array<size_t, 5> first, last, inc, N1;
+    getSubsetArrays( index, first, last, inc, N1 );
+    ArraySize S1( d_size.ndim(), N1.data() ); // size of the result: same ndim, N1 points per dimension
     // Create the new array
-    std::vector<size_t> dim( d_ndim );
-    for ( int d = 0; d < d_ndim; d++ )
-        dim[d]  = last[d] - first[d] + 1;
-    Array<TYPE2> subset( dim );
+    Array<TYPE2, FUN> subset_array( S1 ); // same FUN as the declared return type
     // Fill the new array
-    #if ARRAY_NDIM_MAX > 5
-        #error Function programmed for more than 5 dimensions
-    #endif
-    TYPE2 *subset_data = subset.data();
-    for (size_t i4=first[4]; i4<=last[4]; i4++) {
-        for (size_t i3=first[3]; i3<=last[3]; i3++) {
-            for (size_t i2=first[2]; i2<=last[2]; i2++) {
-                for (size_t i1=first[1]; i1<=last[1]; i1++) {
-                    for (size_t i0=first[0]; i0<=last[0]; i0++) {
-                        size_t k1 = GET_ARRAY_INDEX5D( N1, i0-first[0], 
-                            i1-first[1], i2-first[2], i3-first[3], i4-first[4] );
-                        size_t k2       = GET_ARRAY_INDEX5D( N2, i0, i1, i2, i3, i4 );
+    static_assert( ArraySize::maxDim() == 5, "Not programmed for more than 5 dimensions" );
+    TYPE2 *subset_data = subset_array.data();
+    for ( size_t i4 = first[4], k1 = 0; i4 <= last[4]; i4 += inc[4] ) { // k1: running offset in the result
+        for ( size_t i3 = first[3]; i3 <= last[3]; i3 += inc[3] ) {
+            for ( size_t i2 = first[2]; i2 <= last[2]; i2 += inc[2] ) {
+                for ( size_t i1 = first[1]; i1 <= last[1]; i1 += inc[1] ) {
+                    for ( size_t i0 = first[0]; i0 <= last[0]; i0 += inc[0], k1++ ) {
+                        size_t k2       = d_size.index( i0, i1, i2, i3, i4 ); // offset in this array
                         subset_data[k1] = static_cast<TYPE2>( d_data[k2] );
                     }
                 }
             }
         }
     }
-    return subset;
+    return subset_array;
 }
-template <class TYPE>
-template <class TYPE2>
-void Array<TYPE>::copySubset( const std::vector<size_t> &index, const Array<TYPE2> &subset )
+template<class TYPE, class FUN>
+template<class TYPE2>
+Array<TYPE2, FUN> Array<TYPE, FUN>::subset( const std::vector<size_t> &index ) const
+{
+    auto range = convert( index ); // (first,last) pairs -> one Range per dimension
+    return subset<TYPE2>( range ); // TYPE2 appears only in the return type, so it is passed explicitly
+}
+template<class TYPE, class FUN>
+template<class TYPE2>
+void Array<TYPE, FUN>::copySubset(
+    const std::vector<Range<size_t>> &index, const Array<TYPE2, FUN> &subset )
 {
     // Get the subset indices
     checkSubsetIndex( index );
-    std::array<size_t,5> first, last, N1;
-    getSubsetArrays( index, first, last, N1 );
-    std::array<size_t,5> N2 = getDimArray();
+    std::array<size_t, 5> first, last, inc, N1;
+    getSubsetArrays( index, first, last, inc, N1 );
     // Copy the sub-array
-    #if ARRAY_NDIM_MAX > 5
-        #error Function programmed for more than 5 dimensions
-    #endif
+    static_assert( ArraySize::maxDim() == 5, "Not programmed for more than 5 dimensions" );
     const TYPE2 *src_data = subset.data();
-    for (size_t i4=first[4]; i4<=last[4]; i4++) {
-        for (size_t i3=first[3]; i3<=last[3]; i3++) {
-            for (size_t i2=first[2]; i2<=last[2]; i2++) {
-                for (size_t i1=first[1]; i1<=last[1]; i1++) {
-                    for (size_t i0=first[0]; i0<=last[0]; i0++) {
-                        size_t k1 = GET_ARRAY_INDEX5D( N1, i0-first[0], 
-                            i1-first[1], i2-first[2], i3-first[3], i4-first[4] );
-                        size_t k2  = GET_ARRAY_INDEX5D( N2, i0, i1, i2, i3, i4 );
+    for ( size_t i4 = first[4], k1 = 0; i4 <= last[4]; i4 += inc[4] ) {
+        for ( size_t i3 = first[3]; i3 <= last[3]; i3 += inc[3] ) {
+            for ( size_t i2 = first[2]; i2 <= last[2]; i2 += inc[2] ) {
+                for ( size_t i1 = first[1]; i1 <= last[1]; i1 += inc[1] ) {
+                    for ( size_t i0 = first[0]; i0 <= last[0]; i0 += inc[0], k1++ ) {
+                        size_t k2  = d_size.index( i0, i1, i2, i3, i4 );
                         d_data[k2] = static_cast<TYPE>( src_data[k1] );
                     }
                 }
@@ -405,26 +517,22 @@ void Array<TYPE>::copySubset( const std::vector<size_t> &index, const Array<TYPE
     }
 }
 
-template <class TYPE>
-void Array<TYPE>::addSubset( const std::vector<size_t> &index, const Array<TYPE> &subset )
+template<class TYPE, class FUN>
+void Array<TYPE, FUN>::addSubset(
+    const std::vector<Range<size_t>> &index, const Array<TYPE, FUN> &subset )
 {
     // Get the subset indices
     checkSubsetIndex( index );
-    std::array<size_t,5> first, last, N1;
-    getSubsetArrays( index, first, last, N1 );
-    std::array<size_t,5> N2 = getDimArray();
+    std::array<size_t, 5> first, last, inc, N1;
+    getSubsetArrays( index, first, last, inc, N1 );
     // add the sub-array
-    #if ARRAY_NDIM_MAX > 5
-        #error Function programmed for more than 5 dimensions
-    #endif
-    for (size_t i4=first[4]; i4<=last[4]; i4++) {
-        for (size_t i3=first[3]; i3<=last[3]; i3++) {
-            for (size_t i2=first[2]; i2<=last[2]; i2++) {
-                for (size_t i1=first[1]; i1<=last[1]; i1++) {
-                    for (size_t i0=first[0]; i0<=last[0]; i0++) {
-                        size_t k1 = GET_ARRAY_INDEX5D( N1, i0-first[0], 
-                            i1-first[1], i2-first[2], i3-first[3], i4-first[4] );
-                        size_t k2  = GET_ARRAY_INDEX5D( N2, i0, i1, i2, i3, i4 );
+    static_assert( ArraySize::maxDim() == 5, "Not programmed for more than 5 dimensions" );
+    for ( size_t i4 = first[4], k1 = 0; i4 <= last[4]; i4 += inc[4] ) {
+        for ( size_t i3 = first[3]; i3 <= last[3]; i3 += inc[3] ) {
+            for ( size_t i2 = first[2]; i2 <= last[2]; i2 += inc[2] ) {
+                for ( size_t i1 = first[1]; i1 <= last[1]; i1 += inc[1] ) {
+                    for ( size_t i0 = first[0]; i0 <= last[0]; i0 += inc[0], k1++ ) {
+                        size_t k2 = d_size.index( i0, i1, i2, i3, i4 );
                         d_data[k2] += subset.d_data[k1];
                     }
                 }
@@ -432,155 +540,134 @@ void Array<TYPE>::addSubset( const std::vector<size_t> &index, const Array<TYPE>
         }
     }
 }
-// clang-format on
+template<class TYPE, class FUN>
+template<class TYPE2>
+void Array<TYPE, FUN>::copySubset(
+    const std::vector<size_t> &index, const Array<TYPE2, FUN> &subset )
+{
+    auto range = convert( index );
+    copySubset( range, subset );
+}
+
+template<class TYPE, class FUN>
+void Array<TYPE, FUN>::addSubset( const std::vector<size_t> &index, const Array<TYPE, FUN> &subset )
+{
+    auto range = convert( index );
+    addSubset( range, subset );
+}
 
 
 /********************************************************
-*  Operator overloading                                 *
-********************************************************/
-template <class TYPE>
-bool Array<TYPE>::operator==( const Array &rhs ) const
+ *  Operator overloading                                 *
+ ********************************************************/
+template<class TYPE, class FUN>
+bool Array<TYPE, FUN>::operator==( const Array &rhs ) const
 {
     if ( this == &rhs )
         return true;
-    if ( d_length != rhs.d_length )
+    if ( d_size != rhs.d_size )
         return false;
-    if ( d_ndim != rhs.d_ndim )
-        return false;
-    for ( int d = 0; d < d_ndim; d++ ) {
-        if ( d_N[d] != rhs.d_N[d] )
-            return false;
-    }
     bool match = true;
-    for ( size_t i = 0; i < d_length; i++ )
+    for ( size_t i = 0; i < d_size.length(); i++ )
         match = match && d_data[i] == rhs.d_data[i];
     return match;
 }
 
 
 /********************************************************
-*  Get a view of an C array                             *
-********************************************************/
-template <class TYPE>
-std::shared_ptr<Array<TYPE>> Array<TYPE>::view( size_t N, std::shared_ptr<TYPE> const &data )
+ *  Get a view of an C array                             *
+ ********************************************************/
+template<class TYPE, class FUN>
+std::shared_ptr<Array<TYPE, FUN>> Array<TYPE, FUN>::view(
+    size_t N, std::shared_ptr<TYPE> const &data )
 {
-    return view( std::vector<size_t>{N}, data );
+    return view( ArraySize( N ), data );
 }
-template <class TYPE>
-std::shared_ptr<Array<TYPE>> Array<TYPE>::view(
+template<class TYPE, class FUN>
+std::shared_ptr<Array<TYPE, FUN>> Array<TYPE, FUN>::view(
     size_t N1, size_t N2, std::shared_ptr<TYPE> const &data )
 {
-    return view( std::vector<size_t>{N1,N2}, data );
+    return view( ArraySize( N1, N2 ), data );
 }
-template <class TYPE>
-std::shared_ptr<Array<TYPE>> Array<TYPE>::view(
+template<class TYPE, class FUN>
+std::shared_ptr<Array<TYPE, FUN>> Array<TYPE, FUN>::view(
     size_t N1, size_t N2, size_t N3, std::shared_ptr<TYPE> const &data )
 {
-    return view( std::vector<size_t>{N1,N2,N3}, data );
+    return view( ArraySize( N1, N2, N3 ), data );
 }
-template <class TYPE>
-std::shared_ptr<const Array<TYPE>> Array<TYPE>::constView(
+template<class TYPE, class FUN>
+std::shared_ptr<const Array<TYPE, FUN>> Array<TYPE, FUN>::constView(
     size_t N, std::shared_ptr<const TYPE> const &data )
 {
-    return constView( std::vector<size_t>{N}, data );
+    return constView( ArraySize( N ), data );
 }
-template <class TYPE>
-std::shared_ptr<const Array<TYPE>> Array<TYPE>::constView(
+template<class TYPE, class FUN>
+std::shared_ptr<const Array<TYPE, FUN>> Array<TYPE, FUN>::constView(
     size_t N1, size_t N2, std::shared_ptr<const TYPE> const &data )
 {
-    return constView( std::vector<size_t>{N1,N2}, data );
+    return constView( ArraySize( N1, N2 ), data );
 }
-template <class TYPE>
-std::shared_ptr<const Array<TYPE>> Array<TYPE>::constView(
+template<class TYPE, class FUN>
+std::shared_ptr<const Array<TYPE, FUN>> Array<TYPE, FUN>::constView(
     size_t N1, size_t N2, size_t N3, std::shared_ptr<const TYPE> const &data )
 {
-    return constView( std::vector<size_t>{N1,N2,N3}, data );
+    return constView( ArraySize( N1, N2, N3 ), data );
 }
-template <class TYPE>
-std::shared_ptr<Array<TYPE>> Array<TYPE>::view(
-    const std::vector<size_t> &N, std::shared_ptr<TYPE> const &data )
+template<class TYPE, class FUN>
+std::shared_ptr<Array<TYPE, FUN>> Array<TYPE, FUN>::view(
+    const ArraySize &N, std::shared_ptr<TYPE> const &data )
 {
-    std::shared_ptr<Array<TYPE>> array( new Array<TYPE>() );
-    array->d_ndim   = N.size();
-    array->d_length = 1;
-    for ( size_t i = 0; i < N.size(); i++ ) {
-        array->d_N[i] = N[i];
-        array->d_length *= N[i];
-    }
-    if ( array->d_ndim == 0 )
-        array->d_length = 0;
-    array->d_ptr        = data;
-    array->d_data       = array->d_ptr.get();
+    std::shared_ptr<Array<TYPE, FUN>> array( new Array<TYPE, FUN>() );
+    array->d_size = N;
+    array->d_ptr  = data;
+    array->d_data = array->d_ptr.get();
     return array;
 }
-template <class TYPE>
-std::shared_ptr<const Array<TYPE>> Array<TYPE>::constView(
-    const std::vector<size_t> &N, std::shared_ptr<const TYPE> const &data )
+template<class TYPE, class FUN>
+std::shared_ptr<const Array<TYPE, FUN>> Array<TYPE, FUN>::constView(
+    const ArraySize &N, std::shared_ptr<const TYPE> const &data )
 {
-    return view( N, std::const_pointer_cast<TYPE>( data ) );
+    std::shared_ptr<Array<TYPE, FUN>> array( new Array<TYPE, FUN>() );
+    array->d_size = N;
+    array->d_ptr  = std::const_pointer_cast<TYPE>( data ); // d_ptr stores a non-const pointer
+    array->d_data = array->d_ptr.get();
+    return array;
 }
-template <class TYPE>
-void Array<TYPE>::view2( Array<TYPE> &src )
+template<class TYPE, class FUN>
+void Array<TYPE, FUN>::view2( Array<TYPE, FUN> &src )
 {
     view2( src.size(), src.getPtr() );
     d_data = src.d_data;
 }
-template <class TYPE>
-void Array<TYPE>::view2( const std::vector<size_t> &N, std::shared_ptr<TYPE> const &data )
+template<class TYPE, class FUN>
+void Array<TYPE, FUN>::view2( const ArraySize &N, std::shared_ptr<TYPE> const &data )
 {
-    d_ndim = static_cast<int>( N.size() );
-    for ( size_t i = 0; i < ARRAY_NDIM_MAX; i++ ) {
-        d_N[i] = 1;
-    }
-    d_length = d_ndim == 0 ? 0 : 1;
-    for ( size_t i = 0; i < N.size(); i++ ) {
-        d_N[i] = N[i];
-        d_length *= d_N[i];
-    }
+    d_size = N;
     d_ptr  = data;
     d_data = d_ptr.get();
 }
-
-template <class TYPE>
-void Array<TYPE>::viewRaw( const std::initializer_list<size_t> &N, TYPE *data )
+template<class TYPE, class FUN>
+void Array<TYPE, FUN>::viewRaw( int ndim, const size_t *dims, TYPE *data )
 {
-    d_ndim = static_cast<int>( N.size() );
-    for ( size_t i = 0; i < ARRAY_NDIM_MAX; i++ ) {
-        d_N[i] = 1;
-    }
-    d_length = d_ndim == 0 ? 0 : 1;
-    size_t i = 0;
-    for ( auto it = N.begin(); it != N.end(); ++it, ++i ) {
-        d_N[i] = *it;
-        d_length *= *it;
-    }
+    d_size = ArraySize( ndim, dims );
     d_ptr.reset();
     d_data = data;
 }
-template <class TYPE>
-void Array<TYPE>::viewRaw( const std::vector<size_t> &N, TYPE *data )
+template<class TYPE, class FUN>
+void Array<TYPE, FUN>::viewRaw( const ArraySize &N, TYPE *data )
 {
-    d_ndim = static_cast<int>( N.size() );
-    for ( size_t i = 0; i < ARRAY_NDIM_MAX; i++ ) {
-        d_N[i] = 1;
-    }
-    d_length = d_ndim == 0 ? 0 : 1;
-    size_t i = 0;
-    for ( auto it = N.begin(); it != N.end(); ++it, ++i ) {
-        d_N[i] = *it;
-        d_length *= *it;
-    }
+    d_size = N;
     d_ptr.reset();
     d_data = data;
 }
 
 
 /********************************************************
-*  Convert array types                                  *
-********************************************************/
-template <class TYPE>
-template <class TYPE2>
-std::shared_ptr<Array<TYPE2>> Array<TYPE>::convert( std::shared_ptr<Array<TYPE>> array )
+ *  Convert array types                                  *
+ ********************************************************/
+template<class TYPE, class FUN>
+template<class TYPE2>
+std::shared_ptr<Array<TYPE2>> Array<TYPE, FUN>::convert( std::shared_ptr<Array<TYPE, FUN>> array )
 {
     if ( std::is_same<TYPE, TYPE2>() )
         return array;
@@ -588,161 +675,200 @@ std::shared_ptr<Array<TYPE2>> Array<TYPE>::convert( std::shared_ptr<Array<TYPE>>
     array2.copy( *array );
     return array2;
 }
-template <class TYPE>
-template <class TYPE2>
-std::shared_ptr<const Array<TYPE2>> Array<TYPE>::convert( std::shared_ptr<const Array<TYPE>> array )
+template<class TYPE, class FUN>
+template<class TYPE2>
+std::shared_ptr<const Array<TYPE2>> Array<TYPE, FUN>::convert(
+    std::shared_ptr<const Array<TYPE, FUN>> array )
 {
-    return Array<TYPE>::convert( std::const_pointer_cast<Array<TYPE2>>( array ) );
+    return convert<TYPE2>( std::const_pointer_cast<Array<TYPE, FUN>>( array ) );
 }
-template <class TYPE>
-template <class TYPE2>
-void Array<TYPE>::copy( const Array<TYPE2> &array )
+template<class TYPE, class FUN>
+template<class TYPE2>
+void Array<TYPE, FUN>::copy( const Array<TYPE2> &array )
 {
     resize( array.size() );
     const TYPE2 *src = array.data();
-    for ( size_t i = 0; i < d_length; i++ )
-        d_data[i]  = static_cast<TYPE>( src[i] );
+    for ( size_t i = 0; i < d_size.length(); i++ )
+        d_data[i] = static_cast<TYPE>( src[i] );
 }
-template <class TYPE>
-template <class TYPE2>
-void Array<TYPE>::copy( const TYPE2 *src )
+template<class TYPE, class FUN>
+template<class TYPE2>
+void Array<TYPE, FUN>::copy( const TYPE2 *src )
 {
-    for ( size_t i = 0; i < d_length; i++ )
-        d_data[i]  = static_cast<TYPE>( src[i] );
+    for ( size_t i = 0; i < d_size.length(); i++ )
+        d_data[i] = static_cast<TYPE>( src[i] );
 }
-template <class TYPE>
-template <class TYPE2>
-void Array<TYPE>::copyTo( TYPE2 *dst ) const
+template<class TYPE, class FUN>
+template<class TYPE2>
+void Array<TYPE, FUN>::copyTo( TYPE2 *dst ) const
 {
-    for ( size_t i = 0; i < d_length; i++ )
-        dst[i]     = static_cast<TYPE2>( d_data[i] );
+    for ( size_t i = 0; i < d_size.length(); i++ )
+        dst[i] = static_cast<TYPE2>( d_data[i] );
 }
-template <class TYPE>
-void Array<TYPE>::fill( const TYPE &value )
+template<class TYPE, class FUN>
+template<class TYPE2>
+Array<TYPE2, FUN> Array<TYPE, FUN>::cloneTo() const
 {
-    for ( size_t i = 0; i < d_length; i++ )
-        d_data[i]  = value;
+    Array<TYPE2, FUN> dst( this->size() );
+    auto dst_data = dst.data();
+    for ( size_t i = 0; i < d_size.length(); i++ )
+        dst_data[i] = static_cast<TYPE2>( d_data[i] );
+    return dst;
 }
-template <class TYPE>
-void Array<TYPE>::scale( const TYPE &value )
+template<class TYPE, class FUN>
+void Array<TYPE, FUN>::fill( const TYPE &value )
 {
-    for ( size_t i = 0; i < d_length; i++ )
+    for ( size_t i = 0; i < d_size.length(); i++ )
+        d_data[i] = value;
+}
+template<class TYPE, class FUN>
+void Array<TYPE, FUN>::scale( const TYPE &value )
+{
+    for ( size_t i = 0; i < d_size.length(); i++ )
         d_data[i] *= value;
 }
-template <class TYPE>
-    void Array<TYPE>::pow(const Array<TYPE> &baseArray, const TYPE &exp )
+template<class TYPE, class FUN>
+void Array<TYPE, FUN>::pow( const Array<TYPE, FUN> &baseArray, const TYPE &exp )
 {
     // not insisting on the shapes being the same
     // but insisting on the total size being the same
-    AMP_ASSERT(d_length==baseArray.length());
+    if ( d_size.length() != baseArray.length() )
+        throw std::logic_error( "length of arrays do not match" );
 
     const auto base_data = baseArray.data();
-    for ( size_t i = 0; i < d_length; i++ )
-        d_data[i]  = pow(base_data[i], exp);
+    for ( size_t i = 0; i < d_size.length(); i++ )
+        d_data[i] = std::pow( base_data[i], exp );
 }
 
+
 /********************************************************
-*  Simple math operations                               *
-********************************************************/
-template <class TYPE>
-bool Array<TYPE>::NaNs() const
+ *  Replicate the array                                  *
+ ********************************************************/
+template<class TYPE, class FUN>
+Array<TYPE, FUN> Array<TYPE, FUN>::repmat( const std::vector<size_t> &N_rep ) const
+{
+    std::vector<size_t> N2( d_size.begin(), d_size.end() );
+    if ( N2.size() < N_rep.size() )
+        N2.resize( N_rep.size(), 1 );
+    std::array<size_t, 5> N1, Nr;
+    N1.fill( 1 );
+    Nr.fill( 1 );
+    for ( size_t d = 0; d < N2.size(); d++ ) { // every dimension, not only those listed in N_rep
+        N1[d] = N2[d];                         // source extent (N2 is not scaled yet)
+        Nr[d] = d < N_rep.size() ? N_rep[d] : 1; // dimensions missing from N_rep are kept once
+        N2[d] *= Nr[d];
+    }
+    Array<TYPE, FUN> y( N2 );
+    static_assert( ArraySize::maxDim() <= 5, "Not programmed for dimensions > 5" );
+    TYPE *y2 = y.data();
+    for ( size_t i4 = 0, index = 0; i4 < N1[4]; i4++ ) {
+        for ( size_t j4 = 0; j4 < Nr[4]; j4++ ) {
+            for ( size_t i3 = 0; i3 < N1[3]; i3++ ) {
+                for ( size_t j3 = 0; j3 < Nr[3]; j3++ ) {
+                    for ( size_t i2 = 0; i2 < N1[2]; i2++ ) {
+                        for ( size_t j2 = 0; j2 < Nr[2]; j2++ ) {
+                            for ( size_t i1 = 0; i1 < N1[1]; i1++ ) {
+                                for ( size_t j1 = 0; j1 < Nr[1]; j1++ ) {
+                                    for ( size_t i0 = 0; i0 < N1[0]; i0++ ) {
+                                        size_t k = d_size.index( i0, i1, i2, i3, i4 );
+                                        TYPE x   = d_data[k];
+                                        for ( size_t j0 = 0; j0 < Nr[0]; j0++, index++ )
+                                            y2[index] = x;
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+    return y;
+}
+
+
+/********************************************************
+ *  Simple math operations                               *
+ ********************************************************/
+template<class TYPE, class FUN>
+bool Array<TYPE, FUN>::NaNs() const
 {
     bool test = false;
-    for ( size_t i = 0; i < d_length; i++ )
+    for ( size_t i = 0; i < d_size.length(); i++ )
         test = test || d_data[i] != d_data[i];
     return test;
 }
-template <class TYPE>
-TYPE Array<TYPE>::min() const
+
+template<class TYPE, class FUN>
+TYPE Array<TYPE, FUN>::mean( void ) const
 {
-    TYPE x = std::numeric_limits<TYPE>::max();
-    for ( size_t i = 0; i < d_length; i++ )
-        x = std::min( x, d_data[i] );
+    TYPE x = this->sum() / d_size.length();
     return x;
 }
-template <class TYPE>
-TYPE Array<TYPE>::max() const
+template<class TYPE, class FUN>
+Array<TYPE, FUN> Array<TYPE, FUN>::min( int dir ) const
 {
-    TYPE x = std::numeric_limits<TYPE>::min();
-    for ( size_t i = 0; i < d_length; i++ )
-        x = std::max( x, d_data[i] );
-    return x;
-}
-template <class TYPE>
-TYPE Array<TYPE>::sum() const
-{
-    TYPE x = 0;
-    for ( size_t i = 0; i < d_length; i++ )
-        x += d_data[i];
-    return x;
-}
-template <class TYPE>
-TYPE Array<TYPE>::mean( void ) const
-{
-    TYPE x = sum() / d_length;
-    return x;
-}
-template <class TYPE>
-Array<TYPE> Array<TYPE>::min( int dir ) const
-{
-    std::vector<size_t> size_ans = size();
-    size_ans[dir]                = 1;
-    Array<TYPE> ans( size_ans );
+    auto size_ans = d_size;
+    size_ans.resize( dir, 1 );
+    Array<TYPE, FUN> ans( size_ans );
     size_t N1 = 1, N2 = 1, N3 = 1;
-    for ( int d = 0; d < std::min( dir, d_ndim ); d++ )
-        N1 *= d_N[d];
-    N2 = d_N[dir];
-    for ( int d = dir + 1; d < std::min( d_ndim, ARRAY_NDIM_MAX ); d++ )
-        N3 *= d_N[d];
+    for ( int d = 0; d < std::min<int>( dir, d_size.ndim() ); d++ )
+        N1 *= d_size[d];
+    N2 = d_size[dir];
+    for ( size_t d = dir + 1; d < d_size.ndim(); d++ )
+        N3 *= d_size[d];
     TYPE *data2 = ans.d_data;
     for ( size_t i3 = 0; i3 < N3; i3++ ) {
         for ( size_t i1 = 0; i1 < N1; i1++ ) {
             TYPE x = d_data[i1 + i3 * N1 * N2];
             for ( size_t i2 = 0; i2 < N2; i2++ )
-                x               = std::min( x, d_data[i1 + i2 * N1 + i3 * N1 * N2] );
+                x = std::min( x, d_data[i1 + i2 * N1 + i3 * N1 * N2] );
             data2[i1 + i3 * N1] = x;
         }
     }
     return ans;
 }
-template <class TYPE>
-Array<TYPE> Array<TYPE>::max( int dir ) const
+template<class TYPE, class FUN>
+Array<TYPE, FUN> Array<TYPE, FUN>::max( int dir ) const
 {
-    std::vector<size_t> size_ans = size();
-    size_ans[dir]                = 1;
-    Array<TYPE> ans( size_ans );
+    auto size_ans = d_size;
+    size_ans.resize( dir, 1 );
+    Array<TYPE, FUN> ans( size_ans );
     size_t N1 = 1, N2 = 1, N3 = 1;
-    for ( int d = 0; d < std::min( dir, d_ndim ); d++ )
-        N1 *= d_N[d];
-    N2 = d_N[dir];
-    for ( int d = dir + 1; d < std::min( d_ndim, ARRAY_NDIM_MAX ); d++ )
-        N3 *= d_N[d];
-    TYPE *data2 = ans.d_data;
+    for ( int d = 0; d < std::min<int>( dir, d_size.ndim() ); d++ )
+        N1 *= d_size[d];
+    N2 = d_size[dir];
+    DISABLE_WARNINGS // Suppress false array subscript is above array bounds
+        for ( size_t d = dir + 1; d < d_size.ndim(); d++ ) N3 *= d_size[d];
+    ENABLE_WARNINGS // Enable warnings
+        TYPE *data2 = ans.d_data;
     for ( size_t i3 = 0; i3 < N3; i3++ ) {
         for ( size_t i1 = 0; i1 < N1; i1++ ) {
             TYPE x = d_data[i1 + i3 * N1 * N2];
             for ( size_t i2 = 0; i2 < N2; i2++ )
-                x               = std::max( x, d_data[i1 + i2 * N1 + i3 * N1 * N2] );
+                x = std::max( x, d_data[i1 + i2 * N1 + i3 * N1 * N2] );
             data2[i1 + i3 * N1] = x;
         }
     }
     return ans;
 }
-template <class TYPE>
-Array<TYPE> Array<TYPE>::sum( int dir ) const
+template<class TYPE, class FUN>
+Array<TYPE, FUN> Array<TYPE, FUN>::sum( int dir ) const
 {
-    std::vector<size_t> size_ans = size();
-    size_ans[dir]                = 1;
-    Array<TYPE> ans( size_ans );
+    auto size_ans = d_size;
+    size_ans.resize( dir, 1 );
+    Array<TYPE, FUN> ans( size_ans );
     size_t N1 = 1, N2 = 1, N3 = 1;
-    for ( int d = 0; d < std::min( dir, d_ndim ); d++ )
-        N1 *= d_N[d];
-    N2 = d_N[dir];
-    for ( int d = dir + 1; d < std::min( d_ndim, ARRAY_NDIM_MAX ); d++ )
-        N3 *= d_N[d];
+    for ( int d = 0; d < std::min<int>( dir, d_size.ndim() ); d++ )
+        N1 *= d_size[d];
+    N2 = d_size[dir];
+    DISABLE_WARNINGS
+    for ( size_t d = dir + 1; d < d_size.ndim(); d++ )
+        N3 *= d_size[d];
+    ENABLE_WARNINGS
     TYPE *data2 = ans.d_data;
-    for ( int i3 = 0; i3 < N3; i3++ ) {
-        for ( int i1 = 0; i1 < N1; i1++ ) {
+    for ( size_t i3 = 0; i3 < N3; i3++ ) {
+        for ( size_t i1 = 0; i1 < N1; i1++ ) {
             TYPE x = 0;
             for ( size_t i2 = 0; i2 < N2; i2++ )
                 x += d_data[i1 + i2 * N1 + i3 * N1 * N2];
@@ -751,51 +877,44 @@ Array<TYPE> Array<TYPE>::sum( int dir ) const
     }
     return ans;
 }
-template <class TYPE>
-TYPE Array<TYPE>::min( const std::vector<size_t> &index ) const
+template<class TYPE, class FUN>
+TYPE Array<TYPE, FUN>::min( const std::vector<Range<size_t>> &range ) const
 {
     // Get the subset indicies
-    checkSubsetIndex( index );
-    std::array<size_t, 5> first, last, N1;
-    getSubsetArrays( index, first, last, N1 );
-    std::array<size_t, 5> N2 = getDimArray();
-#if ARRAY_NDIM_MAX > 5
-#error Function programmed for more than 5 dimensions
-#endif
+    checkSubsetIndex( range );
+    std::array<size_t, 5> first, last, inc, N1;
+    getSubsetArrays( range, first, last, inc, N1 );
+    static_assert( ArraySize::maxDim() <= 5, "Not programmed for more than 5 dimensions" );
     TYPE x = std::numeric_limits<TYPE>::max();
-    for ( size_t i4 = first[4]; i4 <= last[4]; i4++ ) {
-        for ( size_t i3 = first[3]; i3 <= last[3]; i3++ ) {
-            for ( size_t i2 = first[2]; i2 <= last[2]; i2++ ) {
-                for ( size_t i1 = first[1]; i1 <= last[1]; i1++ ) {
-                    for ( size_t i0 = first[0]; i0 <= last[0]; i0++ ) {
-                        size_t k1 = GET_ARRAY_INDEX5D( N2, i0, i1, i2, i3, i4 );
+    for ( size_t i4 = first[4]; i4 <= last[4]; i4 += inc[4] ) {
+        for ( size_t i3 = first[3]; i3 <= last[3]; i3 += inc[3] ) {
+            for ( size_t i2 = first[2]; i2 <= last[2]; i2 += inc[2] ) {
+                for ( size_t i1 = first[1]; i1 <= last[1]; i1 += inc[1] ) {
+                    for ( size_t i0 = first[0]; i0 <= last[0]; i0 += inc[0] ) {
+                        size_t k1 = d_size.index( i0, i1, i2, i3, i4 );
                         x         = std::min( x, d_data[k1] );
                     }
                 }
             }
         }
     }
-
     return x;
 }
-template <class TYPE>
-TYPE Array<TYPE>::max( const std::vector<size_t> &index ) const
+template<class TYPE, class FUN>
+TYPE Array<TYPE, FUN>::max( const std::vector<Range<size_t>> &range ) const
 {
     // Get the subset indicies
-    checkSubsetIndex( index );
-    std::array<size_t, 5> first, last, N1;
-    getSubsetArrays( index, first, last, N1 );
-    std::array<size_t, 5> N2 = getDimArray();
-#if ARRAY_NDIM_MAX > 5
-#error Function programmed for more than 5 dimensions
-#endif
+    checkSubsetIndex( range );
+    std::array<size_t, 5> first, last, inc, N1;
+    getSubsetArrays( range, first, last, inc, N1 );
+    static_assert( ArraySize::maxDim() <= 5, "Not programmed for more than 5 dimensions" );
-    TYPE x = std::numeric_limits<TYPE>::min();
+    TYPE x = std::numeric_limits<TYPE>::lowest(); // min() is the smallest positive for floats
-    for ( size_t i4 = first[4]; i4 <= last[4]; i4++ ) {
-        for ( size_t i3 = first[3]; i3 <= last[3]; i3++ ) {
-            for ( size_t i2 = first[2]; i2 <= last[2]; i2++ ) {
-                for ( size_t i1 = first[1]; i1 <= last[1]; i1++ ) {
-                    for ( size_t i0 = first[0]; i0 <= last[0]; i0++ ) {
-                        size_t k1 = GET_ARRAY_INDEX5D( N2, i0, i1, i2, i3, i4 );
+    for ( size_t i4 = first[4]; i4 <= last[4]; i4 += inc[4] ) {
+        for ( size_t i3 = first[3]; i3 <= last[3]; i3 += inc[3] ) {
+            for ( size_t i2 = first[2]; i2 <= last[2]; i2 += inc[2] ) {
+                for ( size_t i1 = first[1]; i1 <= last[1]; i1 += inc[1] ) {
+                    for ( size_t i0 = first[0]; i0 <= last[0]; i0 += inc[0] ) {
+                        size_t k1 = d_size.index( i0, i1, i2, i3, i4 );
                         x         = std::max( x, d_data[k1] );
                     }
                 }
@@ -804,24 +923,21 @@ TYPE Array<TYPE>::max( const std::vector<size_t> &index ) const
     }
     return x;
 }
-template <class TYPE>
-TYPE Array<TYPE>::sum( const std::vector<size_t> &index ) const
+template<class TYPE, class FUN>
+TYPE Array<TYPE, FUN>::sum( const std::vector<Range<size_t>> &range ) const
 {
     // Get the subset indicies
-    checkSubsetIndex( index );
-    std::array<size_t, 5> first, last, N1;
-    getSubsetArrays( index, first, last, N1 );
-    std::array<size_t, 5> N2 = getDimArray();
-#if ARRAY_NDIM_MAX > 5
-#error Function programmed for more than 5 dimensions
-#endif
+    checkSubsetIndex( range );
+    std::array<size_t, 5> first, last, inc, N1;
+    getSubsetArrays( range, first, last, inc, N1 );
+    static_assert( ArraySize::maxDim() <= 5, "Not programmed for more than 5 dimensions" );
     TYPE x = 0;
-    for ( size_t i4 = first[4]; i4 <= last[4]; i4++ ) {
-        for ( size_t i3 = first[3]; i3 <= last[3]; i3++ ) {
-            for ( size_t i2 = first[2]; i2 <= last[2]; i2++ ) {
-                for ( size_t i1 = first[1]; i1 <= last[1]; i1++ ) {
-                    for ( size_t i0 = first[0]; i0 <= last[0]; i0++ ) {
-                        size_t k1 = GET_ARRAY_INDEX5D( N2, i0, i1, i2, i3, i4 );
+    for ( size_t i4 = first[4]; i4 <= last[4]; i4 += inc[4] ) {
+        for ( size_t i3 = first[3]; i3 <= last[3]; i3 += inc[3] ) {
+            for ( size_t i2 = first[2]; i2 <= last[2]; i2 += inc[2] ) {
+                for ( size_t i1 = first[1]; i1 <= last[1]; i1 += inc[1] ) {
+                    for ( size_t i0 = first[0]; i0 <= last[0]; i0 += inc[0] ) {
+                        size_t k1 = d_size.index( i0, i1, i2, i3, i4 );
                         x += d_data[k1];
                     }
                 }
@@ -830,105 +946,56 @@ TYPE Array<TYPE>::sum( const std::vector<size_t> &index ) const
     }
     return x;
 }
-template <class TYPE>
-TYPE Array<TYPE>::mean( const std::vector<size_t> &index ) const
+template<class TYPE, class FUN>
+TYPE Array<TYPE, FUN>::mean( const std::vector<Range<size_t>> &range ) const
 {
     // Get the subset indicies
-    checkSubsetIndex( index );
-    std::array<size_t, 5> first, last, N1;
-    getSubsetArrays( index, first, last, N1 );
-#if ARRAY_NDIM_MAX > 5
-#error Function programmed for more than 5 dimensions
-#endif
+    checkSubsetIndex( range );
+    std::array<size_t, 5> first, last, inc, N1;
+    getSubsetArrays( range, first, last, inc, N1 );
+    static_assert( ArraySize::maxDim() <= 5, "Not programmed for more than 5 dimensions" );
     size_t n = 1;
     for ( auto &d : N1 )
         n *= d;
-    TYPE x = sum( index ) / n;
+    TYPE x = sum( range ) / n;
     return x;
 }
-
-template <class TYPE>
-Array<TYPE> &Array<TYPE>::operator+=( const Array<TYPE> &rhs )
+template<class TYPE, class FUN>
+TYPE Array<TYPE, FUN>::min( const std::vector<size_t> &index ) const
 {
-    if ( !sizeMatch(rhs) )
-        throw std::logic_error( "Array don't match" );
-    for ( size_t i = 0; i < d_length; i++ )
-        d_data[i] += rhs.d_data[i];
-    return *this;
+    auto range = convert( index );
+    return min( range );
 }
-template <class TYPE>
-Array<TYPE> &Array<TYPE>::operator-=( const Array<TYPE> &rhs )
+template<class TYPE, class FUN>
+TYPE Array<TYPE, FUN>::max( const std::vector<size_t> &index ) const
 {
-    if ( !sizeMatch(rhs) )
-        throw std::logic_error( "Array don't match" );
-    for ( size_t i = 0; i < d_length; i++ )
-        d_data[i] -= rhs.d_data[i];
-    return *this;
+    auto range = convert( index );
+    return max( range );
 }
-template <class TYPE>
-Array<TYPE> &Array<TYPE>::operator+=( const TYPE &rhs )
+template<class TYPE, class FUN>
+TYPE Array<TYPE, FUN>::sum( const std::vector<size_t> &index ) const
 {
-    for ( size_t i = 0; i < d_length; i++ )
-        d_data[i] += rhs;
-    return *this;
+    auto range = convert( index );
+    return sum( range );
 }
-template <class TYPE>
-Array<TYPE> &Array<TYPE>::operator-=( const TYPE &rhs )
+template<class TYPE, class FUN>
+TYPE Array<TYPE, FUN>::mean( const std::vector<size_t> &index ) const
 {
-    for ( size_t i = 0; i < d_length; i++ )
-        d_data[i] -= rhs;
-    return *this;
-}
-template <class TYPE>
-Array<TYPE> operator+( const Array<TYPE>& a, const Array<TYPE>& b )
-{
-    Array<TYPE> c = a;
-    c += b;
-    return c;
-}
-template <class TYPE>
-Array<TYPE> operator-( const Array<TYPE>& a, const Array<TYPE>& b )
-{
-    Array<TYPE> c = a;
-    c -= b;
-    return c;
-}
-template <class TYPE>
-Array<TYPE> operator*( const Array<TYPE>& a, const Array<TYPE>& b )
-{
-    return Array<TYPE>::multiply(a,b);
-}
-template <class TYPE>
-Array<TYPE> Array<TYPE>::multiply( const Array<TYPE>& a, const Array<TYPE>& b )
-{
-    Array<TYPE> c;
-    if ( a.d_ndim==2 && b.d_ndim==2 ) {
-        c.resize( a.size(0), b.size(1) );
-        c.fill(0);
-        for (size_t k=0; k<b.size(1); k++) {
-            for (size_t j=0; j<a.size(1); j++) {
-                for (size_t i=0; i<a.size(0); i++) {
-                    c(i,k) += a(i,j) * b(j,k);
-                }
-            }
-        }
-    } else {
-        throw std::logic_error("Not finished yet");
-    }
-    return c;
+    auto range = convert( index );
+    return mean( range );
 }
 
 
 /********************************************************
-*  Find all elements that match the given operation     *
-********************************************************/
-template <class TYPE>
-std::vector<size_t> Array<TYPE>::find(
+ *  Find all elements that match the given operation     *
+ ********************************************************/
+template<class TYPE, class FUN>
+std::vector<size_t> Array<TYPE, FUN>::find(
     const TYPE &value, std::function<bool( const TYPE &, const TYPE & )> compare ) const
 {
     std::vector<size_t> result;
-    result.reserve( d_length );
-    for ( size_t i = 0; i < d_length; i++ ) {
+    result.reserve( d_size.length() );
+    for ( size_t i = 0; i < d_size.length(); i++ ) {
         if ( compare( d_data[i], value ) )
             result.push_back( i );
     }
@@ -937,115 +1004,122 @@ std::vector<size_t> Array<TYPE>::find(
 
 
 /********************************************************
-*  Print an array to an output stream                   *
-********************************************************/
-template <class TYPE>
-void Array<TYPE>::print( std::ostream& os, const std::string& name, const std::string& prefix ) const
+ *  Print an array to an output stream                   *
+ ********************************************************/
+template<class TYPE, class FUN>
+void Array<TYPE, FUN>::print(
+    std::ostream &os, const std::string &name, const std::string &prefix ) const
 {
-    if ( d_ndim==1 ) {
-        for (size_t i=0; i<d_N[0]; i++)
-            os << prefix << name << "[" << i << "] = " << operator()(i) << std::endl;
-    } else if ( d_ndim==2 ) {
+    if ( d_size.ndim() == 1 ) {
+        for ( size_t i = 0; i < d_size[0]; i++ )
+            os << prefix << name << "[" << i << "] = " << d_data[i] << std::endl;
+    } else if ( d_size.ndim() == 2 ) {
         os << prefix << name << ":" << std::endl;
-        for (size_t i=0; i<d_N[0]; i++) {
-            for (size_t j=0; j<d_N[1]; j++)
-                os << prefix << "  " << operator()(i,j);
+        for ( size_t i = 0; i < d_size[0]; i++ ) {
+            for ( size_t j = 0; j < d_size[1]; j++ )
+                os << prefix << "  " << operator()( i, j );
             os << std::endl;
         }
     } else {
-        throw std::logic_error("Not programmed for this dimension");
+        throw std::logic_error( "Not programmed for this dimension" );
     }
 }
 
 
 /********************************************************
-*  Reverse dimensions (transpose)                       *
-********************************************************/
-template <class TYPE>
-Array<TYPE> Array<TYPE>::reverseDim( ) const
+ *  Reverse dimensions (transpose)                       *
+ ********************************************************/
+template<class TYPE, class FUN>
+Array<TYPE, FUN> Array<TYPE, FUN>::reverseDim() const
 {
-    std::vector<size_t> N2(ARRAY_NDIM_MAX);
-    for ( int d=0; d<ARRAY_NDIM_MAX; d++)
-        N2[d] = d_N[ARRAY_NDIM_MAX-d-1];
-    Array<TYPE> y( N2 );
-#if ARRAY_NDIM_MAX != 5
-    #error Function programmed for dimensions other than 5
-#endif
-    TYPE* y2 = y.data();
-    for (size_t i0=0; i0<d_N[0]; i0++) {
-        for (size_t i1=0; i1<d_N[1]; i1++) {
-            for (size_t i2=0; i2<d_N[2]; i2++) {
-                for (size_t i3=0; i3<d_N[3]; i3++) {
-                    for (size_t i4=0; i4<d_N[4]; i4++) {
-                        y2[GET_ARRAY_INDEX5D(N2,i4,i3,i2,i1,i0)] = d_data[GET_ARRAY_INDEX5D(d_N,i0,i1,i2,i3,i4)];
+    size_t N2[ArraySize::maxDim()];
+    for ( int d = 0; d < ArraySize::maxDim(); d++ )
+        N2[d] = d_size[ArraySize::maxDim() - d - 1];
+    ArraySize S2( ArraySize::maxDim(), N2 );
+    Array<TYPE, FUN> y( S2 );
+    static_assert( ArraySize::maxDim() == 5, "Not programmed for dimensions other than 5" );
+    TYPE *y2 = y.data();
+    for ( size_t i0 = 0; i0 < d_size[0]; i0++ ) {
+        for ( size_t i1 = 0; i1 < d_size[1]; i1++ ) {
+            for ( size_t i2 = 0; i2 < d_size[2]; i2++ ) {
+                for ( size_t i3 = 0; i3 < d_size[3]; i3++ ) {
+                    for ( size_t i4 = 0; i4 < d_size[4]; i4++ ) {
+                        y2[S2.index( i4, i3, i2, i1, i0 )] =
+                            d_data[d_size.index( i0, i1, i2, i3, i4 )];
                     }
                 }
             }
         }
     }
-    auto S2 = size();
-    for ( int d=0; d<d_ndim; d++)
-        S2[d] = size(d_ndim-d-1);
-    y.reshape( S2 );
+    for ( int d = 0; d < d_size.ndim(); d++ )
+        N2[d] = d_size[d_size.ndim() - d - 1];
+    y.reshape( ArraySize( d_size.ndim(), N2 ) );
     return y;
 }
 
 
 /********************************************************
-*  Coarsen the array                                    *
-********************************************************/
-template <class TYPE>
-Array<TYPE> Array<TYPE>::coarsen( const Array<TYPE>& filter ) const
+ *  Coarsen the array                                    *
+ ********************************************************/
+template<class TYPE, class FUN>
+Array<TYPE, FUN> Array<TYPE, FUN>::coarsen( const Array<TYPE, FUN> &filter ) const
 {
     auto S2 = size();
-    for (size_t i=0; i<S2.size(); i++) {
-        S2[i] /= filter.size(i);
-        INSIST(S2[i]*filter.size(i)==size(i),"Array must be multiple of filter size");
+    for ( size_t i = 0; i < S2.size(); i++ ) {
+        S2.resize( i, S2[i] / filter.size( i ) ); // same update form as coarsen( ratio, filter )
+        if ( S2[i] * filter.size( i ) != size( i ) )
+            throw std::invalid_argument( "Array must be multiple of filter size" );
     }
-    Array<TYPE> y( S2 );
-    INSIST(d_ndim<=3,"Function programmed for more than 5 dimensions");
-    const size_t *Nh = filter.d_N;
-    for (size_t k1=0; k1<y.d_N[2]; k1++) {
-        for (size_t j1=0; j1<y.d_N[1]; j1++) {
-            for (size_t i1=0; i1<y.d_N[0]; i1++) {
+    Array<TYPE, FUN> y( S2 );
+    if ( d_size.ndim() > 3 ) // the loops below only walk the first three dimensions
+        throw std::logic_error( "Function not programmed for more than 3 dimensions" );
+    const auto& Nh = filter.d_size;
+    for ( size_t k1 = 0; k1 < y.d_size[2]; k1++ ) {
+        for ( size_t j1 = 0; j1 < y.d_size[1]; j1++ ) {
+            for ( size_t i1 = 0; i1 < y.d_size[0]; i1++ ) {
                 TYPE tmp = 0;
-                for (size_t k2=0; k2<Nh[2]; k2++) {
-                    for (size_t j2=0; j2<Nh[1]; j2++) {
-                        for (size_t i2=0; i2<Nh[0]; i2++) {
-                            tmp += filter(i2,j2,k2) * this->operator()(i1*Nh[0]+i2,j1*Nh[1]+j2,k1*Nh[2]+k2);
+                for ( size_t k2 = 0; k2 < Nh[2]; k2++ ) {
+                    for ( size_t j2 = 0; j2 < Nh[1]; j2++ ) {
+                        for ( size_t i2 = 0; i2 < Nh[0]; i2++ ) {
+                            tmp += filter( i2, j2, k2 ) * this->operator()( i1 *Nh[0] + i2,
+                                                              j1 * Nh[1] + j2, k1 * Nh[2] + k2 );
                         }
                     }
                 }
-                y(i1,j1,k1) = tmp;
+                y( i1, j1, k1 ) = tmp; // weighted sum of the filter-sized block at ( i1, j1, k1 )
             }
         }
     }
     return y;
 }
-template <class TYPE>
-Array<TYPE> Array<TYPE>::coarsen( const std::vector<size_t>& ratio, std::function<TYPE(const Array<TYPE>&)> filter ) const
+template<class TYPE, class FUN>
+Array<TYPE, FUN> Array<TYPE, FUN>::coarsen(
+    const std::vector<size_t> &ratio, std::function<TYPE( const Array<TYPE, FUN> & )> filter ) const
 {
-    ASSERT((int)ratio.size()==d_ndim);
+    if ( ratio.size() != d_size.ndim() ) // NOTE(review): ratio[2] is still read below when ndim < 3
+        throw std::logic_error( "ratio size does not match ndim" );
     auto S2 = size();
-    for (size_t i=0; i<S2.size(); i++) {
-        S2[i] /= ratio[i];
-        INSIST(S2[i]*ratio[i]==size(i),"Array must be multiple of filter size");
+    for ( size_t i = 0; i < S2.size(); i++ ) {
+        S2.resize( i, S2[i] / ratio[i] );
+        if ( S2[i] * ratio[i] != size( i ) )
+            throw std::invalid_argument( "Array must be multiple of filter size" );
     }
-    Array<TYPE> tmp(ratio);
-    TYPE* tmp2 = tmp.data();
-    Array<TYPE> y( S2 );
-    INSIST(d_ndim<=3,"Function programmed for more than 3 dimensions");
-    for (size_t k1=0; k1<y.d_N[2]; k1++) {
-        for (size_t j1=0; j1<y.d_N[1]; j1++) {
-            for (size_t i1=0; i1<y.d_N[0]; i1++) {
-                for (size_t k2=0; k2<ratio[2]; k2++) {
-                    for (size_t j2=0; j2<ratio[1]; j2++) {
-                        for (size_t i2=0; i2<ratio[0]; i2++) {
-                            tmp2[GET_ARRAY_INDEX3D(tmp.d_N,i2,j2,k2)] = this->operator()(i1*ratio[0]+i2,j1*ratio[1]+j2,k1*ratio[2]+k2);
+    Array<TYPE, FUN> tmp( ratio ); // scratch block handed to filter for each coarse cell
+    Array<TYPE, FUN> y( S2 );
+    if ( d_size.ndim() > 3 ) // the loops below only walk the first three dimensions
+        throw std::logic_error( "Function not programmed for more than 3 dimensions" );
+    for ( size_t k1 = 0; k1 < y.d_size[2]; k1++ ) {
+        for ( size_t j1 = 0; j1 < y.d_size[1]; j1++ ) {
+            for ( size_t i1 = 0; i1 < y.d_size[0]; i1++ ) {
+                for ( size_t k2 = 0; k2 < ratio[2]; k2++ ) {
+                    for ( size_t j2 = 0; j2 < ratio[1]; j2++ ) {
+                        for ( size_t i2 = 0; i2 < ratio[0]; i2++ ) {
+                            tmp( i2, j2, k2 ) = this->operator()(
+                                i1 *ratio[0] + i2, j1 * ratio[1] + j2, k1 * ratio[2] + k2 );
                         }
                     }
                 }
-                y(i1,j1,k1) = filter(tmp);
+                y( i1, j1, k1 ) = filter( tmp ); // reduce the gathered block to one coarse value
             }
         }
     }
@@ -1053,4 +1127,172 @@ Array<TYPE> Array<TYPE>::coarsen( const std::vector<size_t>& ratio, std::functio
 }
 
 
+/********************************************************
+ *  Concatenates the arrays                              *
+ ********************************************************/
+template<class TYPE, class FUN>
+void Array<TYPE, FUN>::cat( const Array<TYPE, FUN> &x, int dim )
+{
+    std::vector<Array<TYPE, FUN>> tmp( 2 ); // the two operands handed to cat( vector, dim )
+    tmp[0].view2( *this );
+    tmp[1].view2( const_cast<Array<TYPE, FUN> &>( x ) ); // presumably view2 needs a non-const ref
+    *this = cat( tmp, dim ); // replace this array with [ this, x ] joined along dim
+}
+template<class TYPE, class FUN>
+Array<TYPE, FUN> Array<TYPE, FUN>::cat( const std::vector<Array> &x, int dim )
+{
+    if ( x.empty() )
+        return Array<TYPE, FUN>();
+    // All inputs need the same number of dimensions and equal extents everywhere except dim
+    bool check = true;
+    for ( size_t i = 1; i < x.size(); i++ ) {
+        check = check && x[i].ndim() == x[0].ndim();
+        for ( int d = 0; d < x[0].ndim(); d++ )
+            check = check && ( d == dim || x[i].size( d ) == x[0].size( d ) );
+    }
+    if ( !check )
+        throw std::logic_error( "Array dimensions do not match for concatenation" );
+    // Output size: x[0]'s size with the extents along dim summed over all inputs
+    auto size = x[0].d_size;
+    for ( size_t i = 1; i < x.size(); i++ )
+        size.resize( dim, size[dim] + x[i].size( dim ) );
+    Array<TYPE, FUN> out( size );
+    size_t N1 = 1;         // product of the extents before dim
+    size_t N2 = size[dim]; // output extent along dim
+    size_t N3 = 1;         // product of the extents after dim
+    for ( int d = 0; d < dim; d++ )
+        N1 *= size[d];
+    for ( size_t d = dim + 1; d < size.ndim(); d++ )
+        N3 *= size[d];
+    TYPE *data = out.data();
+    for ( size_t i = 0, i0 = 0; i < x.size(); i++ ) { // i0: offset of x[i] along dim in out
+        const TYPE *src = x[i].data();
+        size_t N22      = x[i].size( dim );
+        for ( size_t j2 = 0; j2 < N3; j2++ ) {
+            for ( size_t i1 = 0; i1 < N22; i1++ ) {
+                for ( size_t j1 = 0; j1 < N1; j1++ ) {
+                    data[j1 + ( i1 + i0 ) * N1 + j2 * N1 * N2] = src[j1 + i1 * N1 + j2 * N1 * N22];
+                }
+            }
+        }
+        i0 += N22;
+    }
+    return out;
+}
+
+
+/********************************************************
+ *  Math operations (should call the Math class)         *
+ ********************************************************/
+template<class TYPE, class FUN>
+void Array<TYPE, FUN>::rand()
+{
+    FUN::rand( *this );
+}
+template<class TYPE, class FUN>
+Array<TYPE, FUN> &Array<TYPE, FUN>::operator+=( const Array<TYPE, FUN> &rhs )
+{
+    const auto &fun = []( const TYPE &a, const TYPE &b ) { return a + b; };
+    FUN::transform( fun, *this, rhs, *this );
+    return *this;
+}
+template<class TYPE, class FUN>
+Array<TYPE, FUN> &Array<TYPE, FUN>::operator-=( const Array<TYPE, FUN> &rhs )
+{
+    const auto &fun = []( const TYPE &a, const TYPE &b ) { return a - b; };
+    FUN::transform( fun, *this, rhs, *this );
+    return *this;
+}
+template<class TYPE, class FUN>
+Array<TYPE, FUN> &Array<TYPE, FUN>::operator+=( const TYPE &rhs )
+{
+    const auto &fun = [rhs]( const TYPE &x ) { return x + rhs; };
+    FUN::transform( fun, *this, *this );
+    return *this;
+}
+template<class TYPE, class FUN>
+Array<TYPE, FUN> &Array<TYPE, FUN>::operator-=( const TYPE &rhs )
+{
+    const auto &fun = [rhs]( const TYPE &x ) { return x - rhs; };
+    FUN::transform( fun, *this, *this );
+    return *this;
+}
+template<class TYPE, class FUN>
+Array<TYPE, FUN> operator+( const Array<TYPE, FUN> &a, const Array<TYPE, FUN> &b )
+{
+    Array<TYPE, FUN> c;
+    const auto &fun = []( const TYPE &a, const TYPE &b ) { return a + b; };
+    FUN::transform( fun, a, b, c );
+    return c;
+}
+template<class TYPE, class FUN>
+Array<TYPE, FUN> operator-( const Array<TYPE, FUN> &a, const Array<TYPE, FUN> &b )
+{
+    Array<TYPE, FUN> c;
+    const auto &fun = []( const TYPE &a, const TYPE &b ) { return a - b; };
+    FUN::transform( fun, a, b, c );
+    return c;
+}
+template<class TYPE, class FUN>
+Array<TYPE, FUN> operator*( const Array<TYPE, FUN> &a, const Array<TYPE, FUN> &b )
+{
+    return Array<TYPE, FUN>::multiply( a, b );
+}
+template<class TYPE, class FUN>
+inline Array<TYPE, FUN> operator*( const Array<TYPE, FUN> &a, const std::vector<TYPE> &b )
+{
+    Array<TYPE, FUN> b2;
+    b2.viewRaw( { b.size() }, const_cast<TYPE *>( b.data() ) );
+    return Array<TYPE, FUN>::multiply( a, b2 );
+}
+template<class TYPE, class FUN>
+TYPE Array<TYPE, FUN>::min() const
+{
+    const auto &fun = []( const TYPE &a, const TYPE &b ) { return a < b ? a : b; };
+    return FUN::reduce( fun, *this );
+}
+template<class TYPE, class FUN>
+TYPE Array<TYPE, FUN>::max() const
+{
+    const auto &fun = []( const TYPE &a, const TYPE &b ) { return a > b ? a : b; };
+    return FUN::reduce( fun, *this );
+}
+template<class TYPE, class FUN>
+TYPE Array<TYPE, FUN>::sum() const
+{
+    const auto &fun = []( const TYPE &a, const TYPE &b ) { return a + b; };
+    return FUN::reduce( fun, *this );
+}
+template<class TYPE, class FUN>
+Array<TYPE, FUN> Array<TYPE, FUN>::multiply( const Array<TYPE, FUN> &a, const Array<TYPE, FUN> &b )
+{
+    Array<TYPE, FUN> c;
+    FUN::multiply( a, b, c );
+    return c;
+}
+template<class TYPE, class FUN>
+void Array<TYPE, FUN>::axpby( const TYPE &alpha, const Array<TYPE, FUN> &x, const TYPE &beta )
+{
+    // In-place this = alpha * x + beta * this; binary transform with *this as input and output
+    const auto &fun = [=]( const TYPE &a, const TYPE &b ) { return alpha * a + beta * b; };
+    FUN::transform( fun, x, *this, *this );
+}
+template<class TYPE, class FUN>
+Array<TYPE, FUN> Array<TYPE, FUN>::transform(
+    std::function<TYPE( const TYPE & )> fun, const Array<TYPE, FUN> &x )
+{
+    Array<TYPE, FUN> y;
+    FUN::transform( fun, x, y );
+    return y;
+}
+template<class TYPE, class FUN>
+Array<TYPE, FUN> Array<TYPE, FUN>::transform( std::function<TYPE( const TYPE &, const TYPE & )> fun,
+    const Array<TYPE, FUN> &x, const Array<TYPE, FUN> &y )
+{
+    Array<TYPE, FUN> z;
+    FUN::transform( fun, x, y, z );
+    return z;
+}
+
+
 #endif
diff --git a/common/FunctionTable.h b/common/FunctionTable.h
new file mode 100644
index 00000000..e2bdcb67
--- /dev/null
+++ b/common/FunctionTable.h
@@ -0,0 +1,81 @@
+#ifndef included_FunctionTable
+#define included_FunctionTable
+
+
+#include "common/Array.h"
+
+#include <functional>
+
+
+/*!
+ * Class FunctionTable is a serial function table class that defines
+ *   a series of operations that can be performed on the Array class.
+ *   Users can implement additional versions of the function table that match
+ *   the interface to change the behavior of the array class.
+ */
+class FunctionTable final
+{
+public:
+    /*!
+     * Initialize the array with random values
+     * @param[in] x         The array to operate on
+     */
+    template<class TYPE, class FUN>
+    static void rand( Array<TYPE, FUN> &x );
+
+    /*!
+     * Perform a reduce operator y = f(x)
+     * @param[in] op        The function operation
+     *                      Note: the operator is a template parameter
+     *                      (compared to a std::function to improve performance)
+     * @param[in] A         The array to operate on
+     * @return              The reduction
+     */
+    template<class TYPE, class FUN, typename LAMBDA>
+    static inline TYPE reduce( LAMBDA &op, const Array<TYPE, FUN> &A );
+
+    /*!
+     * Perform a element-wise operation y = f(x)
+     * @param[in] fun       The function operation
+     *                      Note: the operator is a template parameter
+     *                      (compared to a std::function to improve performance)
+     * @param[in] x         The input array to operate on
+     * @param[out] y        The output array
+     */
+    template<class TYPE, class FUN, typename LAMBDA>
+    static inline void transform( LAMBDA &fun, const Array<TYPE, FUN> &x, Array<TYPE, FUN> &y );
+
+    /*!
+     * Perform a element-wise operation z = f(x,y)
+     * @param[in] fun       The function operation
+     *                      Note: the operator is a template parameter
+     *                      (compared to a std::function to improve performance)
+     * @param[in] x         The first array
+     * @param[in] y         The second array
+     * @param[out] z        The result
+     */
+    template<class TYPE, class FUN, typename LAMBDA>
+    static inline void transform(
+        LAMBDA &fun, const Array<TYPE, FUN> &x, const Array<TYPE, FUN> &y, Array<TYPE, FUN> &z );
+
+    /*!
+     * Multiply two arrays
+     * @param[in] a             The first array
+     * @param[in] b             The second array
+     * @param[out] c            The output array
+     */
+    template<class TYPE, class FUN>
+    static void multiply(
+        const Array<TYPE, FUN> &a, const Array<TYPE, FUN> &b, Array<TYPE, FUN> &c );
+
+
+private:
+    FunctionTable();
+
+    template<class T>
+    static inline void rand( size_t N, T *x );
+};
+
+#include "common/FunctionTable.hpp"
+
+#endif
diff --git a/common/FunctionTable.hpp b/common/FunctionTable.hpp
new file mode 100644
index 00000000..52897d5c
--- /dev/null
+++ b/common/FunctionTable.hpp
@@ -0,0 +1,116 @@
+#ifndef included_FunctionTable_hpp
+#define included_FunctionTable_hpp
+
+#include "common/FunctionTable.h"
+#include "common/Utilities.h"
+
+#include <algorithm>
+#include <cstring>
+#include <limits>
+#include <random>
+
+
+/********************************************************
+ *  Random number initialization                         *
+ ********************************************************/
+template<class TYPE, class FUN>
+void FunctionTable::rand( Array<TYPE, FUN> &x )
+{
+    FunctionTable::rand<TYPE>( x.length(), x.data() );
+}
+template<>
+inline void FunctionTable::rand<double>( size_t N, double *x )
+{
+    std::random_device rd;
+    std::mt19937 gen( rd() );
+    std::uniform_real_distribution<> dis( 0, 1 );
+    for ( size_t i = 0; i < N; i++ )
+        x[i] = dis( gen );
+}
+template<>
+inline void FunctionTable::rand<float>( size_t N, float *x )
+{
+    std::random_device rd;
+    std::mt19937 gen( rd() );
+    std::uniform_real_distribution<> dis( 0, 1 );
+    for ( size_t i = 0; i < N; i++ )
+        x[i] = dis( gen );
+}
+template<>
+inline void FunctionTable::rand<int>( size_t N, int *x )
+{
+    std::random_device rd;
+    std::mt19937 gen( rd() );
+    std::uniform_int_distribution<> dis;
+    for ( size_t i = 0; i < N; i++ )
+        x[i] = dis( gen );
+}
+
+
+/********************************************************
+ *  Reduction                                            *
+ ********************************************************/
+template<class TYPE, class FUN, typename LAMBDA>
+inline TYPE FunctionTable::reduce( LAMBDA &op, const Array<TYPE, FUN> &A )
+{
+    if ( A.length() == 0 ) // empty array: return a value-initialized TYPE
+        return TYPE();
+    const TYPE *x  = A.data();
+    TYPE y         = x[0]; // seed the accumulator with the first element
+    const size_t N = A.length();
+    for ( size_t i = 1; i < N; i++ )
+        y = op( x[i], y ); // fold in index order; op receives ( element, accumulator )
+    return y;
+}
+
+
+/********************************************************
+ *  Unary transformation                                 *
+ ********************************************************/
+template<class TYPE, class FUN, typename LAMBDA>
+inline void FunctionTable::transform( LAMBDA &fun, const Array<TYPE, FUN> &x, Array<TYPE, FUN> &y )
+{
+    y.resize( x.size() );
+    const size_t N = x.length();
+    for ( size_t i = 0; i < N; i++ )
+        y( i ) = fun( x( i ) );
+}
+template<class TYPE, class FUN, typename LAMBDA>
+inline void FunctionTable::transform(
+    LAMBDA &fun, const Array<TYPE, FUN> &x, const Array<TYPE, FUN> &y, Array<TYPE, FUN> &z )
+{
+    if ( !x.sizeMatch( y ) )
+        throw std::logic_error( "Sizes of x and y do not match" );
+    z.resize( x.size() );
+    const size_t N = x.length();
+    for ( size_t i = 0; i < N; i++ )
+        z( i ) = fun( x( i ), y( i ) );
+}
+
+
+/********************************************************
+ *  Multiply two arrays                                  *
+ ********************************************************/
+template<class TYPE, class FUN>
+void FunctionTable::multiply(
+    const Array<TYPE, FUN> &a, const Array<TYPE, FUN> &b, Array<TYPE, FUN> &c )
+{
+    if ( a.ndim() <= 2 && b.ndim() <= 2 ) {
+        if ( a.size( 1 ) != b.size( 0 ) )
+            throw std::logic_error( "Inner dimensions must match" );
+        c.resize( a.size( 0 ), b.size( 1 ) );
+        c.fill( 0 );
+        for ( size_t k = 0; k < b.size( 1 ); k++ ) {
+            for ( size_t j = 0; j < a.size( 1 ); j++ ) {
+                for ( size_t i = 0; i < a.size( 0 ); i++ ) {
+                    c( i, k ) += a( i, j ) * b( j, k );
+                }
+            }
+        }
+    } else {
+        throw std::logic_error( "Not finished yet" );
+    }
+}
+
+
+#endif
diff --git a/common/StackTrace.cpp b/common/StackTrace.cpp
index 9786644e..8b9e4015 100644
--- a/common/StackTrace.cpp
+++ b/common/StackTrace.cpp
@@ -4,14 +4,17 @@
 #include <csignal>
 #include <cstring>
 #include <iostream>
-#include <set>
 #include <map>
+#include <memory>
 #include <mutex>
+#include <random>
+#include <set>
 #include <sstream>
 #include <stdexcept>
 #include <thread>
-#include <memory>
-#include <random>
+
+
+#define perr std::cerr
 
 
 // Detect the OS
@@ -22,7 +25,7 @@
 #elif defined( __APPLE__ )
     #define USE_MAC
     #define USE_NM
-#elif defined( __linux ) || defined( __unix ) || defined( __posix )
+#elif defined( __linux ) || defined( __linux__ ) || defined( __unix ) || defined( __posix )
     #define USE_LINUX
     #define USE_NM
 #else
@@ -31,23 +34,6 @@
 // clang-format on
 
 
-// Include/detect MPI
-// clang-format off
-#ifndef USE_MPI
-    #ifdef USE_EXT_MPI
-        #define USE_MPI
-    #elif defined(__has_include)
-        #if __has_include("mpi.h")
-            #define USE_MPI
-        #endif
-    #endif
-#endif
-#ifdef USE_MPI
-    #include "mpi.h"
-#endif
-// clang-format on
-
-
 // Include system dependent headers
 // clang-format off
 // Detect the OS and include system dependent headers
@@ -66,7 +52,7 @@
     #include <execinfo.h>
     #include <sched.h>
     #include <sys/time.h>
-    #include <time.h>
+    #include <ctime>
     #include <unistd.h>
     #include <sys/syscall.h>
 #endif
@@ -98,22 +84,26 @@
 
 // Set the callstack signal
 #ifdef SIGRTMIN
-    #define CALLSTACK_SIG SIGRTMIN+4
+#define CALLSTACK_SIG ( SIGRTMIN + 4 ) // parenthesized so the expansion is a single operand
 #else
-    #define CALLSTACK_SIG SIGUSR1
-    #define SIGRTMIN SIGUSR1
-    #define SIGRTMAX SIGUSR1
+#define CALLSTACK_SIG SIGUSR1
+#define SIGRTMIN SIGUSR1
+#define SIGRTMAX SIGUSR1
 #endif
 
 
+// Helper thread
+static std::shared_ptr<std::thread> globalMonitorThread;
+
+
 // Utility to break a string by a newline
-static inline std::vector<std::string> breakString( const std::string& str )
+static inline std::vector<std::string> breakString( const std::string &str )
 {
     std::vector<std::string> strvec;
     size_t i1 = 0;
     size_t i2 = std::min( str.find( '\n', i1 ), str.length() );
     while ( i1 < str.length() ) {
-        strvec.push_back( str.substr( i1, i2-i1 ) );
+        strvec.push_back( str.substr( i1, i2 - i1 ) );
         i1 = i2 + 1;
         i2 = std::min( str.find( '\n', i1 ), str.length() );
     }
@@ -121,12 +111,26 @@ static inline std::vector<std::string> breakString( const std::string& str )
 }
 
 
+// Replace every occurrence of s in str with r (in place), scanning left to right.
+// Replaced text is never rescanned; an empty s is a no-op (it would otherwise match forever).
+static inline void strrep( std::string &str, const std::string &s, const std::string &r )
+{
+    if ( s.empty() )
+        return;
+    size_t i = str.find( s );
+    while ( i != std::string::npos ) {
+        str.replace( i, s.length(), r );
+        // Resume after the inserted text so an r that contains s cannot loop
+        i = str.find( s, i + r.length() );
+    }
+}
+
+
 // Utility to strip the path from a filename
 static inline std::string stripPath( const std::string &filename )
 {
-    if ( filename.empty() ) {
+    if ( filename.empty() )
         return std::string();
-    }
     int i = 0;
     for ( i = (int) filename.size() - 1; i >= 0 && filename[i] != 47 && filename[i] != 92; i-- ) {
     }
@@ -166,17 +170,17 @@ BOOL GetModuleListTH32( HANDLE hProcess, DWORD pid );
 BOOL GetModuleListPSAPI( HANDLE hProcess );
 DWORD LoadModule( HANDLE hProcess, LPCSTR img, LPCSTR mod, DWORD64 baseAddr, DWORD size );
 void LoadModules();
-};
+}; // namespace StackTrace
 #endif
 
 
 // Functions to copy data
-static inline char* copy_in( size_t N, const void* data, char *ptr )
+static inline char *copy_in( size_t N, const void *data, char *ptr )
 {
     memcpy( ptr, data, N );
     return ptr + N;
 }
-static inline const char* copy_out( size_t N, void* data, const char *ptr )
+static inline const char *copy_out( size_t N, void *data, const char *ptr )
 {
     memcpy( data, ptr, N );
     return ptr + N;
@@ -184,62 +188,86 @@ static inline const char* copy_out( size_t N, void* data, const char *ptr )
 
 
 /****************************************************************************
-*  Utility to call system command and return output                         *
-****************************************************************************/
+ *  Utility to call system command and return output                         *
+ ****************************************************************************/
 #ifdef USE_WINDOWS
 #define popen _popen
 #define pclose _pclose
 #endif
-std::string StackTrace::exec( const std::string& cmd, int& code )
+std::string StackTrace::exec( const std::string &cmd, int &code )
 {
-    signal( SIGCHLD, SIG_DFL );     // Clear child exited
-    FILE* pipe = popen(cmd.c_str(), "r");
+    signal( SIGCHLD, SIG_DFL ); // Clear child exited
+    FILE *pipe = popen( cmd.c_str(), "r" );
     if ( pipe == nullptr )
         return std::string();
     std::string result = "";
-    result.reserve(1024);    
-    while ( !feof(pipe) ) {
+    result.reserve( 1024 );
+    while ( !feof( pipe ) ) {
         char buffer[257];
         buffer[256] = 0;
-        if ( fgets(buffer, 128, pipe) != NULL )
+        if ( fgets( buffer, 128, pipe ) != nullptr )
             result += buffer;
     }
     auto status = pclose( pipe );
-    code = WEXITSTATUS(status);
+    code        = WEXITSTATUS( status );
     return result;
 }
 
 
 /****************************************************************************
-*  stack_info                                                               *
-****************************************************************************/
-bool StackTrace::stack_info::operator==( const StackTrace::stack_info& rhs ) const
+ *  stack_info                                                               *
+ ****************************************************************************/
+void StackTrace::stack_info::clear()
+{
+    address  = nullptr;
+    address2 = nullptr;
+    object.clear();
+    function.clear();
+    filename.clear();
+    line = -1;
+}
+bool StackTrace::stack_info::operator==( const StackTrace::stack_info &rhs ) const
 {
     if ( address == rhs.address )
         return true;
-    if ( address2==rhs.address2 && object==rhs.object )
+    if ( address2 == rhs.address2 && object == rhs.object )
         return true;
     return false;
 }
-bool StackTrace::stack_info::operator!=( const StackTrace::stack_info& rhs ) const
+bool StackTrace::stack_info::operator!=( const StackTrace::stack_info &rhs ) const
 {
     return !operator==( rhs );
 }
-std::string StackTrace::stack_info::print() const
+int StackTrace::stack_info::getAddressWidth() const
 {
-    char tmp[32];
-    sprintf( tmp, "0x%016llx:  ", reinterpret_cast<unsigned long long int>( address ) );
-    std::string stack( tmp );
-    sprintf( tmp, "%i", line );
-    std::string line_str( tmp );
+    auto addr = reinterpret_cast<unsigned long long int>( address );
+    if ( addr <= 0xFFFF )
+        return 4;
+    if ( addr <= 0xFFFFFFFF )
+        return 8;
+    if ( addr <= 0xFFFFFFFFFFFF )
+        return 12;
+    return 16;
+}
+std::string
+StackTrace::stack_info::print( int widthAddress, int widthObject, int widthFunction ) const
+{
+    char tmp1[64], tmp2[64];
+    sprintf( tmp1, "0x%%0%illx:  ", widthAddress );
+    sprintf( tmp2, tmp1, reinterpret_cast<unsigned long long int>( address ) );
+    std::string stack( tmp2 );
+    sprintf( tmp2, "%i", line );
+    std::string line_str( tmp2 );
+    size_t N = stack.length();
     stack += stripPath( object );
-    stack.resize( std::max<size_t>( stack.size(), 38 ), ' ' );
+    stack.resize( std::max<size_t>( stack.size(), N + widthObject ), ' ' );
+    N = stack.length() + 2;
     stack += "  " + function;
     if ( !filename.empty() && line > 0 ) {
-        stack.resize( std::max<size_t>( stack.size(), 72 ), ' ' );
+        stack.resize( std::max<size_t>( stack.size(), N + widthFunction ), ' ' );
         stack += "  " + stripPath( filename ) + ":" + line_str;
     } else if ( !filename.empty() ) {
-        stack.resize( std::max<size_t>( stack.size(), 72 ), ' ' );
+        stack.resize( std::max<size_t>( stack.size(), N + widthFunction ), ' ' );
         stack += "  " + stripPath( filename );
     } else if ( line > 0 ) {
         stack += " : " + line_str;
@@ -248,164 +276,207 @@ std::string StackTrace::stack_info::print() const
 }
 size_t StackTrace::stack_info::size() const
 {
-    return 2*sizeof(void*) + 4*sizeof(int) + object.size() + function.size() + filename.size();
+    return 2 * sizeof( void * ) + 4 * sizeof( int ) + object.size() + function.size() +
+           filename.size();
 }
-char* StackTrace::stack_info::pack( char* ptr ) const
+char *StackTrace::stack_info::pack( char *ptr ) const
 {
-    int Nobj = object.size();
-    int Nfun = function.size();
+    int Nobj  = object.size();
+    int Nfun  = function.size();
     int Nfile = filename.size();
-    ptr = copy_in( sizeof(void*), &address,  ptr );
-    ptr = copy_in( sizeof(void*), &address2, ptr );
-    ptr = copy_in( sizeof(int), &Nobj,  ptr );
-    ptr = copy_in( sizeof(int), &Nfun,  ptr );
-    ptr = copy_in( sizeof(int), &Nfile, ptr );
-    ptr = copy_in( sizeof(int), &line,  ptr );
-    ptr = copy_in( Nobj,  object.data(),   ptr );
-    ptr = copy_in( Nfun,  function.data(), ptr );
-    ptr = copy_in( Nfile, filename.data(), ptr );
-    return ptr;    
+    ptr       = copy_in( sizeof( void * ), &address, ptr );
+    ptr       = copy_in( sizeof( void * ), &address2, ptr );
+    ptr       = copy_in( sizeof( int ), &Nobj, ptr );
+    ptr       = copy_in( sizeof( int ), &Nfun, ptr );
+    ptr       = copy_in( sizeof( int ), &Nfile, ptr );
+    ptr       = copy_in( sizeof( int ), &line, ptr );
+    ptr       = copy_in( Nobj, object.data(), ptr );
+    ptr       = copy_in( Nfun, function.data(), ptr );
+    ptr       = copy_in( Nfile, filename.data(), ptr );
+    return ptr;
 }
-const char* StackTrace::stack_info::unpack( const char* ptr )
+const char *StackTrace::stack_info::unpack( const char *ptr )
 {
     int Nobj, Nfun, Nfile;
-    ptr = copy_out( sizeof(void*), &address,  ptr );
-    ptr = copy_out( sizeof(void*), &address2, ptr );
-    ptr = copy_out( sizeof(int), &Nobj,  ptr );
-    ptr = copy_out( sizeof(int), &Nfun,  ptr );
-    ptr = copy_out( sizeof(int), &Nfile, ptr );
-    ptr = copy_out( sizeof(int), &line,  ptr );
+    ptr = copy_out( sizeof( void * ), &address, ptr );
+    ptr = copy_out( sizeof( void * ), &address2, ptr );
+    ptr = copy_out( sizeof( int ), &Nobj, ptr );
+    ptr = copy_out( sizeof( int ), &Nfun, ptr );
+    ptr = copy_out( sizeof( int ), &Nfile, ptr );
+    ptr = copy_out( sizeof( int ), &line, ptr );
     object.resize( Nobj );
     function.resize( Nfun );
     filename.resize( Nfile );
-    ptr = copy_out( Nobj,  &object.front(),   ptr );
-    ptr = copy_out( Nfun,  &function.front(), ptr );
+    ptr = copy_out( Nobj, &object.front(), ptr );
+    ptr = copy_out( Nfun, &function.front(), ptr );
     ptr = copy_out( Nfile, &filename.front(), ptr );
-    return ptr; 
+    return ptr;
 }
-std::vector<char> StackTrace::stack_info::packArray( const std::vector<stack_info>& data )
+std::vector<char> StackTrace::stack_info::packArray( const std::vector<stack_info> &data )
 {
-    size_t size = sizeof(int);
-    for (size_t i=0; i<data.size(); i++)
-        size += data[i].size();
-    std::vector<char> vec(size,0);
-    char* ptr = vec.data();
-    int N = data.size();
-    ptr = copy_in( sizeof(int), &N,  ptr );
-    for (size_t i=0; i<data.size(); i++)
-        ptr = data[i].pack( ptr );
+    size_t size = sizeof( int );
+    for ( const auto &i : data )
+        size += i.size();
+    std::vector<char> vec( size, 0 );
+    char *ptr = vec.data();
+    int N     = data.size();
+    ptr       = copy_in( sizeof( int ), &N, ptr );
+    for ( const auto &i : data )
+        ptr = i.pack( ptr );
     return vec;
 }
-std::vector<StackTrace::stack_info> StackTrace::stack_info::unpackArray( const char* ptr )
+std::vector<StackTrace::stack_info> StackTrace::stack_info::unpackArray( const char *ptr )
 {
     int N;
-    ptr = copy_out( sizeof(int), &N, ptr );
-    std::vector<stack_info> data(N);
-    for (size_t i=0; i<data.size(); i++)
-        ptr = data[i].unpack( ptr );
+    ptr = copy_out( sizeof( int ), &N, ptr );
+    std::vector<stack_info> data( N );
+    for ( auto &i : data )
+        ptr = i.unpack( ptr );
     return data;
 }
-static std::vector<char> pack( const std::vector<std::vector<StackTrace::stack_info>>& data )
+#ifdef USE_MPI
+static std::vector<char> pack( const std::vector<std::vector<StackTrace::stack_info>> &data )
 {
-    size_t size = sizeof(int);
-    for (size_t i=0; i<data.size(); i++) {
-        size += sizeof(int);
-        for (size_t j=0; j<data[i].size(); j++)
-            size += data[i][j].size();
+    size_t size = sizeof( int );
+    for ( const auto &i : data ) {
+        size += sizeof( int );
+        for ( size_t j = 0; j < i.size(); j++ )
+            size += i[j].size();
     }
     std::vector<char> out( size, 0 );
-    char* ptr = out.data();
-    int N = data.size();
-    ptr = copy_in( sizeof(int), &N,  ptr );
-    for (int i=0; i<N; i++) {
+    char *ptr = out.data();
+    int N     = data.size();
+    ptr       = copy_in( sizeof( int ), &N, ptr );
+    for ( int i = 0; i < N; i++ ) {
         int M = data[i].size();
-        ptr = copy_in( sizeof(int), &M,  ptr );
-        for (int j=0; j<M; j++)
+        ptr   = copy_in( sizeof( int ), &M, ptr );
+        for ( int j = 0; j < M; j++ )
             ptr = data[i][j].pack( ptr );
     }
     return out;
 }
-static std::vector<std::vector<StackTrace::stack_info>> unpack( const std::vector<char>& in )
+static std::vector<std::vector<StackTrace::stack_info>> unpack( const std::vector<char> &in )
 {
-    const char* ptr = in.data();
+    const char *ptr = in.data();
     int N;
-    ptr = copy_out( sizeof(int), &N, ptr );
+    ptr = copy_out( sizeof( int ), &N, ptr );
     std::vector<std::vector<StackTrace::stack_info>> data( N );
-    for (int i=0; i<N; i++) {
+    for ( int i = 0; i < N; i++ ) {
         int M;
-        ptr = copy_out( sizeof(int), &M, ptr );
+        ptr = copy_out( sizeof( int ), &M, ptr );
         data[i].resize( M );
-        for (int j=0; j<M; j++)
+        for ( int j = 0; j < M; j++ )
             ptr = data[i][j].unpack( ptr );
     }
     return data;
 }
+#endif
 
 
 /****************************************************************************
-*  multi_stack_info                                                         *
-****************************************************************************/
-/*static int maxDepth( const StackTrace::multi_stack_info& stack )
+ *  multi_stack_info                                                         *
+ ****************************************************************************/
+StackTrace::multi_stack_info::multi_stack_info( const std::vector<stack_info> &rhs )
 {
-    int depth = 0;
-    for ( auto child : stack.children )
-        depth = std::max<int>( depth, maxDepth( child ) );
-    return depth+1;
-}*/
-std::vector<std::string> StackTrace::multi_stack_info::print( const std::string& prefix ) const
+    operator=( rhs );
+}
+StackTrace::multi_stack_info &StackTrace::multi_stack_info::
+operator=( const std::vector<stack_info> &rhs )
+{
+    clear();
+    if ( rhs.empty() )
+        return *this;
+    N     = 1;
+    stack = rhs[0];
+    if ( rhs.size() > 1 )
+        add( rhs.size() - 1, &rhs[1] );
+    return *this;
+}
+void StackTrace::multi_stack_info::clear()
+{
+    N = 0;
+    stack.clear();
+    children.clear();
+}
+void StackTrace::multi_stack_info::print2( const std::string &prefix,
+                                           int w[3],
+                                           std::vector<std::string> &text ) const
 {
-    std::vector<std::string> text;
     if ( stack == stack_info() ) {
-        for ( const auto& child : children ) {
-            auto tmp = child.print( );
-            text.insert( text.end(), tmp.begin(), tmp.end() );
-        }
-        return text;
-    }        
-    //auto depth = maxDepth( *this );
-    //std::string line = prefix + "[" + std::to_string( N ) + "] ";
-    //for (auto i=1; i<depth; i++)
-    //    line += "--";
-    //line += stack.print();
-    std::string line = prefix + "[" + std::to_string( N ) + "] " + stack.print();
+        for ( const auto &child : children )
+            child.print2( "", w, text );
+        return;
+    }
+    std::string line = prefix + "[" + std::to_string( N ) + "] " + stack.print( w[0], w[1], w[2] );
     text.push_back( line );
     std::string prefix2 = prefix + "  ";
-    for ( size_t i=0; i<children.size(); i++ ) {
-        const auto& child = children[i];
-        auto tmp = child.print( );
-        for ( size_t j=0; j<tmp.size(); j++ ) {
-            std::string line = prefix2 + tmp[j];
-            if ( children.size()>1 && j>0 && i<children.size()-1 )
+    for ( size_t i = 0; i < children.size(); i++ ) {
+        const auto &child = children[i];
+        std::vector<std::string> text2;
+        child.print2( "", w, text2 );
+        for ( size_t j = 0; j < text2.size(); j++ ) {
+            std::string line = prefix2 + text2[j];
+            if ( children.size() > 1 && j > 0 && i < children.size() - 1 )
                 line[prefix2.size()] = '|';
             text.push_back( line );
         }
     }
+}
+std::vector<std::string> StackTrace::multi_stack_info::print( const std::string &prefix ) const
+{
+    std::vector<std::string> text;
+    int w[3] = { 0 };
+    w[0]     = getAddressWidth();
+    w[1]     = getObjectWidth();
+    w[2]     = getFunctionWidth();
+    print2( prefix, w, text );
     return text;
 }
+int StackTrace::multi_stack_info::getAddressWidth() const
+{
+    int w = stack.getAddressWidth();
+    for ( const auto &child : children )
+        w = std::max( w, child.getAddressWidth() );
+    return w;
+}
+int StackTrace::multi_stack_info::getObjectWidth() const
+{
+    int w = std::min<int>( stripPath( stack.object ).size() + 1, 20 );
+    for ( const auto &child : children )
+        w = std::max( w, child.getObjectWidth() );
+    return w;
+}
+int StackTrace::multi_stack_info::getFunctionWidth() const
+{
+    int w = std::min<int>( stack.function.size() + 1, 40 );
+    for ( const auto &child : children )
+        w = std::max( w, child.getFunctionWidth() );
+    return w;
+}
 void StackTrace::multi_stack_info::add( size_t len, const stack_info *stack )
 {
     if ( len == 0 )
         return;
-    const auto& s = stack[len-1];
-    for ( size_t i=0; i<children.size(); i++) {
-        if ( children[i].stack == s ) {
-            children[i].N++;
+    const auto &s = stack[len - 1];
+    for ( auto &i : children ) {
+        if ( i.stack == s ) {
+            i.N++;
             if ( len > 1 )
-                children[i].add( len-1, stack );
+                i.add( len - 1, stack );
             return;
         }
     }
-    children.resize( children.size()+1 );
-    children.back().N = 1;
+    children.resize( children.size() + 1 );
+    children.back().N     = 1;
     children.back().stack = s;
     if ( len > 1 )
-        children.back().add( len-1, stack );
+        children.back().add( len - 1, stack );
 }
 
 
 /****************************************************************************
-*  Function to find an entry                                                *
-****************************************************************************/
+ *  Function to find an entry                                                *
+ ****************************************************************************/
 template <class TYPE>
 inline size_t findfirst( const std::vector<TYPE> &X, TYPE Y )
 {
@@ -429,28 +500,18 @@ inline size_t findfirst( const std::vector<TYPE> &X, TYPE Y )
 
 
 /****************************************************************************
-* Function to get symbols for the executable from nm (if availible)         *
-* Note: this function maintains an internal cached copy to prevent          *
-*    exccessive calls to nm.  This function also uses a lock to ensure      *
-*    thread safety.                                                         *
-****************************************************************************/
-std::mutex getSymbols_mutex;
-struct global_symbols_struct {
-    std::vector<void *> address;
-    std::vector<char> type;
-    std::vector<std::string> obj;
-    int error;
-} global_symbols;
-std::string StackTrace::getExecutable()
+ *  Function to get the executable name                                      *
+ ****************************************************************************/
+static char global_exe_name[1000] = { 0 };
+static bool setGlobalExecutableName( char *exe )
 {
-    std::string exe;
     try {
 #ifdef USE_LINUX
-        char *buf = new char[0x10000];
+        auto *buf = new char[0x10000];
         int len   = ::readlink( "/proc/self/exe", buf, 0x10000 );
         if ( len != -1 ) {
             buf[len] = '\0';
-            exe      = std::string( buf );
+            strncpy( exe, buf, sizeof( global_exe_name ) - 1 ); // bounded; last byte stays NUL
         }
         delete[] buf;
 #elif defined( USE_MAC )
@@ -458,21 +519,42 @@ std::string StackTrace::getExecutable()
         char *buf     = new char[size];
         memset( buf, 0, size );
         if ( _NSGetExecutablePath( buf, &size ) == 0 )
-            exe = std::string( buf );
+            strncpy( exe, buf, sizeof( global_exe_name ) - 1 ); // bounded; last byte stays NUL
         delete[] buf;
 #elif defined( USE_WINDOWS )
         DWORD size = 0x10000;
         char *buf  = new char[size];
         memset( buf, 0, size );
         GetModuleFileName( nullptr, buf, size );
-        exe = std::string( buf );
+        strncpy( exe, buf, sizeof( global_exe_name ) - 1 ); // bounded; last byte stays NUL
         delete[] buf;
 #endif
     } catch ( ... ) {
     }
-    return exe;
+    return true;
 }
-std::string global_exe_name = StackTrace::getExecutable();
+static bool global_exe_name_set = setGlobalExecutableName( global_exe_name );
+std::string StackTrace::getExecutable()
+{
+    if ( !global_exe_name_set )
+        global_exe_name_set = setGlobalExecutableName( global_exe_name );
+    return std::string( global_exe_name );
+}
+
+
+/****************************************************************************
+ * Function to get symbols for the executable from nm (if available)         *
+ * Note: this function maintains an internal cached copy to prevent          *
+ *    excessive calls to nm.  This function also uses a lock to ensure       *
+ *    thread safety.                                                         *
+ ****************************************************************************/
+std::mutex getSymbols_mutex;
+struct global_symbols_struct {
+    std::vector<void *> address;
+    std::vector<char> type;
+    std::vector<std::string> obj;
+    int error;
+} global_symbols;
 static const global_symbols_struct &getSymbols2()
 {
     static bool loaded = false;
@@ -486,20 +568,20 @@ static const global_symbols_struct &getSymbols2()
             try {
                 char cmd[1024];
 #ifdef USE_LINUX
-                sprintf( cmd, "nm -n --demangle %s", global_exe_name.c_str() );
+                sprintf( cmd, "nm -n --demangle %s", global_exe_name );
 #elif defined( USE_MAC )
-                sprintf( cmd, "nm -n %s | c++filt", global_exe_name.c_str() );
+                sprintf( cmd, "nm -n %s | c++filt", global_exe_name );
 #else
 #error Unknown OS using nm
 #endif
                 int code;
                 auto output = breakString( StackTrace::exec( cmd, code ) );
-                for ( const auto& line : output ) {
+                for ( const auto &line : output ) {
                     if ( line.empty() )
                         continue;
                     if ( line[0] == ' ' )
                         continue;
-                    char *a = const_cast<char*>(line.c_str());
+                    auto *a = const_cast<char *>( line.c_str() );
                     char *b = strchr( a, ' ' );
                     if ( b == nullptr )
                         continue;
@@ -512,11 +594,11 @@ static const global_symbols_struct &getSymbols2()
                     c++;
                     char *d = strchr( c, '\n' );
                     if ( d )
-                        d[0]   = 0;
+                        d[0] = 0;
                     size_t add = strtoul( a, nullptr, 16 );
                     data.address.push_back( reinterpret_cast<void *>( add ) );
                     data.type.push_back( b[0] );
-                    data.obj.push_back( std::string( c ) );
+                    data.obj.emplace_back( c );
                 }
             } catch ( ... ) {
                 data.error = -3;
@@ -530,8 +612,9 @@ static const global_symbols_struct &getSymbols2()
     }
     return data;
 }
-int StackTrace::getSymbols(
-    std::vector<void *> &address, std::vector<char> &type, std::vector<std::string> &obj )
+int StackTrace::getSymbols( std::vector<void *> &address,
+                            std::vector<char> &type,
+                            std::vector<std::string> &obj )
 {
     const global_symbols_struct &data = getSymbols2();
     address                           = data.address;
@@ -542,12 +625,12 @@ int StackTrace::getSymbols(
 
 
 /****************************************************************************
-*  Function to get call stack info                                          *
-****************************************************************************/
+ *  Function to get call stack info                                          *
+ ****************************************************************************/
 #ifdef USE_MAC
-static void *loadAddress( const std::string& object )
+static void *loadAddress( const std::string &object )
 {
-    static std::map<std::string,void*> obj_map;
+    static std::map<std::string, void *> obj_map;
     if ( obj_map.empty() ) {
         uint32_t numImages = _dyld_image_count();
         for ( uint32_t i = 0; i < numImages; i++ ) {
@@ -603,19 +686,21 @@ static std::tuple<std::string, std::string, std::string, int> split_atos( const
 }
 #endif
 #ifdef USE_LINUX
-    typedef uint64_t uint_p;
-#elif defined(USE_MAC)
-    typedef unsigned long uint_p;
+using uint_p = uint64_t;
+#elif defined( USE_MAC )
+using uint_p = unsigned long;
 #endif
 #if defined( USE_LINUX ) || defined( USE_MAC )
-static inline std::string generateCmd( const std::string& s1,
-    const std::string& s2, const std::string& s3,
-    std::vector<void*> addresses, const std::string& s4 )
+static inline std::string generateCmd( const std::string &s1,
+                                       const std::string &s2,
+                                       const std::string &s3,
+                                       std::vector<void *> addresses,
+                                       const std::string &s4 )
 {
     std::string cmd = s1 + s2 + s3;
-    for (size_t i=0; i<addresses.size(); i++) {
+    for ( auto &addresse : addresses ) {
         char tmp[32];
-        sprintf( tmp, "%lx ", reinterpret_cast<uint_p>( addresses[i] ) );
+        sprintf( tmp, "%lx ", reinterpret_cast<uint_p>( addresse ) );
         cmd += tmp;
     }
     cmd += s4;
@@ -635,6 +720,8 @@ static void getFileAndLineObject( std::vector<StackTrace::stack_info*> &info )
             address_list[i] = info[i]->address;
             if ( info[i]->object.find( ".so" ) != std::string::npos )
                 address_list[i] = info[i]->address2; 
+            if ( info[i]->object.find( ".mexa64" ) != std::string::npos )
+                address_list[i] = info[i]->address2; 
         }
         std::string cmd = generateCmd( "addr2line -C -e ", info[0]->object,
             " -f -i ", address_list, " 2> /dev/null" );
@@ -696,9 +783,9 @@ static void getFileAndLine( std::vector<StackTrace::stack_info> &info )
 {
     // Build a list of stack elements for each object
     std::map<std::string,std::vector<StackTrace::stack_info*>> obj_map;
-    for (size_t i=0; i<info.size(); i++) {
-        auto& list = obj_map[info[i].object];
-        list.emplace_back( &info[i] );
+    for (auto & i : info) {
+        auto& list = obj_map[i.object];
+        list.emplace_back( &i );
     }
     // For each object, get the file/line numbers for all entries
     for ( auto& entry : obj_map ) 
@@ -713,7 +800,7 @@ static void getDataFromGlobalSymbols( StackTrace::stack_info &info )
         if ( index > 0 )
             info.object = global_symbols.obj[index - 1];
         else
-            info.object = global_exe_name;
+            info.object = std::string(global_exe_name);
     }
 }
 static void signal_handler( int sig )
@@ -799,10 +886,9 @@ std::vector<StackTrace::stack_info> StackTrace::getStackInfo( const std::vector<
                             info[i].function = std::string( dlinfo.dli_sname );
                         }
                         free( demangled );
-                    #else
-                        if ( dlinfo.dli_sname != NULL )
-                            info[i].function = std::string( dlinfo.dli_sname );
                     #endif
+                    if ( dlinfo.dli_sname != nullptr && info[i].function.empty() )
+                        info[i].function = std::string( dlinfo.dli_sname );
                 #else
                     getDataFromGlobalSymbols( info[i] );
                 #endif
@@ -820,25 +906,23 @@ std::vector<StackTrace::stack_info> StackTrace::getStackInfo( const std::vector<
 /****************************************************************************
 *  Function to get the backtrace                                            *
 ****************************************************************************/
+static int backtrace_thread( const std::thread::native_handle_type&, void**, size_t );
 #if defined( USE_LINUX ) || defined( USE_MAC )
-static std::vector<void*> thread_backtrace;
-static bool thread_backtrace_finished;
+static int thread_backtrace_count;
+static void* thread_backtrace[1000];
 static std::mutex thread_backtrace_mutex;
 static void _callstack_signal_handler( int, siginfo_t*, void* )
 {
-    thread_backtrace = StackTrace::backtrace( );
-    thread_backtrace_finished = true;
+    thread_backtrace_count = backtrace_thread( StackTrace::thisThread(), thread_backtrace, 1000 );
 }
 #endif
-std::vector<void*> StackTrace::backtrace( std::thread::native_handle_type tid )
+static int backtrace_thread( const std::thread::native_handle_type& tid, void **buffer, size_t size )
 {
-    std::vector<void*> trace;
+    int count = 0;
     #if defined( USE_LINUX ) || defined( USE_MAC )
         // Get the trace
         if ( tid == pthread_self() ) {
-            trace.resize(1000,nullptr);
-            int trace_size = ::backtrace( trace.data(), trace.size() );
-            trace.resize (trace_size );
+            count = ::backtrace( buffer, size );
         } else {
             // Note: this will get the backtrace, but terminates the thread in the process!!!
             thread_backtrace_mutex.lock();
@@ -846,17 +930,18 @@ std::vector<void*> StackTrace::backtrace( std::thread::native_handle_type tid )
             sigfillset(&sa.sa_mask);
             sa.sa_flags = SA_SIGINFO;
             sa.sa_sigaction = _callstack_signal_handler;
-            sigaction(CALLSTACK_SIG, &sa, NULL);
-            thread_backtrace_finished = false;
+            sigaction(CALLSTACK_SIG, &sa, nullptr);
+            thread_backtrace_count = -1;
             pthread_kill( tid, CALLSTACK_SIG );
             auto t1 = std::chrono::high_resolution_clock::now();
             auto t2 = std::chrono::high_resolution_clock::now();
-            while ( !thread_backtrace_finished && std::chrono::duration<double>(t2-t1).count()<0.1 ) {
+            while ( thread_backtrace_count==-1 && std::chrono::duration<double>(t2-t1).count()<0.15 ) {
                 std::this_thread::yield();
                 t2 = std::chrono::high_resolution_clock::now();
             }
-            std::swap( trace, thread_backtrace );
-            thread_backtrace_finished = false;
+            count = std::min( std::max(thread_backtrace_count,0), static_cast<int>(size) ); // clamp to caller's buffer
+            memcpy( buffer, thread_backtrace, count*sizeof(void*) );
+            thread_backtrace_count = -1;
             thread_backtrace_mutex.unlock();
         }
     #elif defined( USE_WINDOWS )
@@ -902,7 +987,6 @@ std::vector<void*> StackTrace::backtrace( std::thread::native_handle_type tid )
                 #error "Platform not supported!"
             #endif
 
-            trace.reserve( 1000 );
             auto pid = GetCurrentProcess();
             for ( int frameNum = 0; frameNum<1024; ++frameNum ) {
                 BOOL rtn = StackWalk64( imageType, pid, tid, &frame, &context, readProcMem,
@@ -911,10 +995,10 @@ std::vector<void*> StackTrace::backtrace( std::thread::native_handle_type tid )
                     printf( "ERROR: StackWalk64 (%p)\n", frame.AddrPC.Offset );
                     break;
                 }
-
-                if ( frame.AddrPC.Offset != 0 )
-                    trace.push_back( reinterpret_cast<void*>( frame.AddrPC.Offset ) );
-
+                if ( frame.AddrPC.Offset != 0 && count < static_cast<int>( size ) ) { // skip null PCs; stay within buffer
+                    buffer[count] = reinterpret_cast<void*>( frame.AddrPC.Offset );
+                    count++;
+                }
                 if ( frame.AddrReturn.Offset == 0 )
                     break;
             }
@@ -923,11 +1007,20 @@ std::vector<void*> StackTrace::backtrace( std::thread::native_handle_type tid )
     #else
         #warning Stack trace is not supported on this compiler/OS
     #endif
+    return count;
+}
+std::vector<void*> StackTrace::backtrace( std::thread::native_handle_type tid )
+{
+    std::vector<void*> trace( 1000, nullptr );
+    size_t count = backtrace_thread( tid, trace.data(), trace.size() );
+    trace.resize(count);
     return trace;
 }
 std::vector<void*> StackTrace::backtrace()
 {
-    std::vector<void*> trace = backtrace( thisThread() );
+    std::vector<void*> trace( 1000, nullptr );
+    size_t count = backtrace_thread( thisThread(), trace.data(), trace.size() );
+    trace.resize(count);
     return trace;
 }
 std::vector<std::vector<void *>> StackTrace::backtraceAll()
@@ -935,10 +1028,14 @@ std::vector<std::vector<void *>> StackTrace::backtraceAll()
     // Get the list of threads
     auto threads = activeThreads( );
     // Get the backtrace of each thread
-    std::vector<std::vector<void*>> thread_backtrace;
-    for ( auto thread : threads )
-        thread_backtrace.push_back( backtrace( thread ) );
-    return thread_backtrace;
+    std::vector<std::vector<void*>> trace(threads.size());
+    size_t i = 0;
+    for ( auto it=threads.begin(); i<threads.size(); i++, it++ ) {
+        trace[i].resize(1000);
+        size_t count = backtrace_thread( *it, trace[i].data(), trace[i].size() );
+        trace[i].resize(count);
+    }
+    return trace;
 }
 
 
@@ -947,15 +1044,16 @@ std::vector<std::vector<void *>> StackTrace::backtraceAll()
 ****************************************************************************/
 #if defined( USE_LINUX )
 static std::thread::native_handle_type thread_handle;
+static bool thread_id_finished;
 static void _activeThreads_signal_handler( int )
 {
     auto handle = StackTrace::thisThread( );
     thread_handle = handle;
-    thread_backtrace_finished = true;
+    thread_id_finished = true;
 }
 static inline int get_tid( int pid, const std::string& line )
 {
-    char buf2[128];
+    char buf2[128]={0};
     int i1 = 0;
     while ( line[i1]==' ' && line[i1]!=0 ) { i1++; }
     int i2 = i1;
@@ -1006,12 +1104,12 @@ std::set<std::thread::native_handle_type> StackTrace::activeThreads( )
         signal( CALLSTACK_SIG, _activeThreads_signal_handler );
         for ( auto tid2 : tid ) {
             thread_backtrace_mutex.lock();
-            thread_backtrace_finished = false;
+            thread_id_finished = false;
             thread_handle = thisThread();
             syscall( SYS_tgkill, pid, tid2, CALLSTACK_SIG );
             auto t1 = std::chrono::high_resolution_clock::now();
             auto t2 = std::chrono::high_resolution_clock::now();
-            while ( !thread_backtrace_finished && std::chrono::duration<double>(t2-t1).count()<0.1 ) {
+            while ( !thread_id_finished && std::chrono::duration<double>(t2-t1).count()<0.1 ) {
                 std::this_thread::yield();
                 t2 = std::chrono::high_resolution_clock::now();
             }
@@ -1043,54 +1141,57 @@ std::set<std::thread::native_handle_type> StackTrace::activeThreads( )
         #warning activeThreads is not yet supported on this compiler/OS
     #endif
     threads.insert( thisThread() );
+    if ( globalMonitorThread )
+        threads.erase( globalMonitorThread->native_handle() );
     return threads;
 }
 // clang-format on
 
 
 /****************************************************************************
-*  Function to get the current call stack                                   *
-****************************************************************************/
+ *  Function to get the current call stack                                   *
+ ****************************************************************************/
 std::vector<StackTrace::stack_info> StackTrace::getCallStack()
 {
     auto trace = StackTrace::backtrace();
-    auto info = getStackInfo(trace);
+    auto info  = getStackInfo( trace );
     return info;
 }
 std::vector<StackTrace::stack_info> StackTrace::getCallStack( std::thread::native_handle_type id )
 {
     auto trace = StackTrace::backtrace( id );
-    auto info = getStackInfo(trace);
+    auto info  = getStackInfo( trace );
     return info;
 }
-static StackTrace::multi_stack_info generateMultiStack( const std::vector<std::vector<void*>>& thread_backtrace )
+static StackTrace::multi_stack_info
+generateMultiStack( const std::vector<std::vector<void *>> &thread_backtrace )
 {
     // Get the stack data for all pointers
-    std::set<void*> addresses_set;
-    for (const auto& trace : thread_backtrace ) {
-        for (auto ptr : trace )
+    std::set<void *> addresses_set;
+    for ( const auto &trace : thread_backtrace ) {
+        for ( auto ptr : trace )
             addresses_set.insert( ptr );
     }
-    std::vector<void*> addresses( addresses_set.begin(), addresses_set.end() );
+    std::vector<void *> addresses( addresses_set.begin(), addresses_set.end() );
     auto stack_data = StackTrace::getStackInfo( addresses );
-    std::map<void*,StackTrace::stack_info> map_data;
-    for ( size_t i=0; i<addresses.size(); i++)
+    std::map<void *, StackTrace::stack_info> map_data;
+    for ( size_t i = 0; i < addresses.size(); i++ )
         map_data.insert( std::make_pair( addresses[i], stack_data[i] ) );
     // Create the multi-stack trace
     StackTrace::multi_stack_info multistack;
-    for ( const auto& trace : thread_backtrace ) {
+    for ( const auto &trace : thread_backtrace ) {
         if ( trace.empty() )
             continue;
         // Create the stack for the given thread trace
         std::vector<StackTrace::stack_info> stack( trace.size() );
-        for (size_t i=0; i<trace.size(); i++)
+        for ( size_t i = 0; i < trace.size(); i++ )
             stack[i] = map_data[trace[i]];
         // Add the data to the multistack
         multistack.add( stack.size(), stack.data() );
     }
     return multistack;
 }
-StackTrace::multi_stack_info StackTrace::getAllCallStacks( )
+StackTrace::multi_stack_info StackTrace::getAllCallStacks()
 {
     // Get the backtrace of each thread
     auto thread_backtrace = backtraceAll();
@@ -1100,10 +1201,9 @@ StackTrace::multi_stack_info StackTrace::getAllCallStacks( )
 }
 
 
-
 /****************************************************************************
-*  Function to get system search paths                                      *
-****************************************************************************/
+ *  Function to get system search paths                                      *
+ ****************************************************************************/
 std::string StackTrace::getSymPaths()
 {
     std::string paths;
@@ -1160,8 +1260,8 @@ std::string StackTrace::getSymPaths()
 
 
 /****************************************************************************
-*  Load modules for windows                                                 *
-****************************************************************************/
+ *  Load modules for windows                                                 *
+ ****************************************************************************/
 #ifdef USE_WINDOWS
 BOOL StackTrace::GetModuleListTH32( HANDLE hProcess, DWORD pid )
 {
@@ -1331,18 +1431,15 @@ void StackTrace::LoadModules()
 
 
 /****************************************************************************
-*  Get the signal name                                                      *
-****************************************************************************/
-std::string StackTrace::signalName( int sig )
-{
-    return std::string( strsignal(sig) );
-}
+ *  Get the signal name                                                      *
+ ****************************************************************************/
+std::string StackTrace::signalName( int sig ) { return std::string( strsignal( sig ) ); }
 std::vector<int> StackTrace::allSignalsToCatch()
 {
     std::set<int> signals;
-    for (int i=1; i<32; i++)
+    for ( int i = 1; i < 32; i++ )
         signals.insert( i );
-    for (int i=SIGRTMIN; i<=SIGRTMAX; i++)
+    for ( int i = SIGRTMIN; i <= SIGRTMAX; i++ )
         signals.insert( i );
     signals.erase( SIGKILL );
     signals.erase( SIGSTOP );
@@ -1352,15 +1449,15 @@ std::vector<int> StackTrace::defaultSignalsToCatch()
 {
     auto tmp = allSignalsToCatch();
     std::set<int> signals( tmp.begin(), tmp.end() );
-    signals.erase( SIGWINCH );  // Don't catch window changed by default
-    signals.erase( SIGCONT );   // Don't catch continue by default
+    signals.erase( SIGWINCH ); // Don't catch window changed by default
+    signals.erase( SIGCONT );  // Don't catch continue by default
     return std::vector<int>( signals.begin(), signals.end() );
 }
 
 
 /****************************************************************************
-*  Set the signal handlers                                                  *
-****************************************************************************/
+ *  Set the signal handlers                                                  *
+ ****************************************************************************/
 static std::function<void( std::string, StackTrace::terminateType )> abort_fun;
 static std::string rethrow()
 {
@@ -1398,7 +1495,7 @@ static void term_func()
 }
 void StackTrace::clearSignal( int sig )
 {
-    if ( signals_set.find(sig) != signals_set.end() ) {
+    if ( signals_set.find( sig ) != signals_set.end() ) {
         signal( sig, SIG_DFL );
         signals_set.erase( sig );
     }
@@ -1409,7 +1506,7 @@ void StackTrace::clearSignals()
         signal( sig, SIG_DFL );
     signals_set.clear();
 }
-void StackTrace::setSignals( const std::vector<int>& signals, void (*handler) (int) )
+void StackTrace::setSignals( const std::vector<int> &signals, void ( *handler )( int ) )
 {
     for ( auto sig : signals ) {
         signal( sig, handler );
@@ -1427,12 +1524,11 @@ void StackTrace::setErrorHandlers(
 
 
 /****************************************************************************
-*  Global call stack functionallity                                         *
-****************************************************************************/
+ *  Global call stack functionality                                          *
+ ****************************************************************************/
 #ifdef USE_MPI
 static MPI_Comm globalCommForGlobalCommStack = MPI_COMM_NULL;
-static std::shared_ptr<std::thread> globalMonitorThread;
-static bool stopGlobalMonitorThread = false;
+static bool stopGlobalMonitorThread          = false;
 static void runGlobalMonitorThread()
 {
     int rank = 0;
@@ -1445,7 +1541,7 @@ static void runGlobalMonitorThread()
         MPI_Status status;
         int err = MPI_Iprobe( MPI_ANY_SOURCE, 1, globalCommForGlobalCommStack, &flag, &status );
         if ( err != MPI_SUCCESS ) {
-            printf("Internal error in StackTrace::getGlobalCallStacks::runGlobalMonitorThread\n");
+            printf( "Internal error in StackTrace::getGlobalCallStacks::runGlobalMonitorThread\n" );
             break;
         } else if ( flag != 0 ) {
             // We received a request
@@ -1453,8 +1549,8 @@ static void runGlobalMonitorThread()
             int tag;
             MPI_Recv( &tag, 1, MPI_INT, src_rank, 1, globalCommForGlobalCommStack, &status );
             // Get a trace of all threads (except this)
-            auto threads = StackTrace::activeThreads( );
-            threads.erase( StackTrace::thisThread( ) );
+            auto threads = StackTrace::activeThreads();
+            threads.erase( StackTrace::thisThread() );
             if ( threads.empty() )
                 continue;
             // Get the stack trace of each thread
@@ -1467,38 +1563,52 @@ static void runGlobalMonitorThread()
             MPI_Send( data.data(), count, MPI_CHAR, src_rank, tag, globalCommForGlobalCommStack );
         } else {
             // No requests recieved
-            std::this_thread::sleep_for( std::chrono::milliseconds(50) );
+            std::this_thread::sleep_for( std::chrono::milliseconds( 50 ) );
         }
     }
 }
 void StackTrace::globalCallStackInitialize( MPI_Comm comm )
 {
-    #ifdef USE_MPI
-        MPI_Comm_dup( comm, &globalCommForGlobalCommStack );
-    #endif
+#ifdef USE_MPI
+    MPI_Comm_dup( comm, &globalCommForGlobalCommStack );
+#endif
     stopGlobalMonitorThread = false;
     globalMonitorThread.reset( new std::thread( runGlobalMonitorThread ) );
 }
-void StackTrace::globalCallStackFinalize( )
+void StackTrace::globalCallStackFinalize()
 {
     stopGlobalMonitorThread = true;
     globalMonitorThread->join();
     globalMonitorThread.reset();
-    #ifdef USE_MPI
-        if ( globalCommForGlobalCommStack )
-            MPI_Comm_free( &globalCommForGlobalCommStack );
-    #endif
+#ifdef USE_MPI
+    if ( globalCommForGlobalCommStack != MPI_COMM_NULL )
+        MPI_Comm_free( &globalCommForGlobalCommStack );
+    globalCommForGlobalCommStack = MPI_COMM_NULL;
+#endif
 }
-StackTrace::multi_stack_info StackTrace::getGlobalCallStacks( )
+StackTrace::multi_stack_info StackTrace::getGlobalCallStacks()
 {
     // Check if we properly initialized the comm
     if ( globalMonitorThread == nullptr ) {
-        printf("Warning: getGlobalCallStacks called without call to globalCallStackInitialize\n");
-        return getAllCallStacks( );
+        printf( "Warning: getGlobalCallStacks called without call to globalCallStackInitialize\n" );
+        return getAllCallStacks();
     }
-    if ( activeThreads().size()==1 ) {
-        printf("Warning: getAllCallStacks not supported on this OS, defaulting to basic call stack\n");
-        return getAllCallStacks( );
+    if ( globalMonitorThread == nullptr ) {
+        printf( "Warning: getGlobalCallStacks called without call to globalCallStackInitialize\n" );
+        return getAllCallStacks();
+    }
+#ifdef USE_MPI
+    int provided;
+    MPI_Query_thread( &provided );
+    if ( provided != MPI_THREAD_MULTIPLE ) {
+        printf( "Warning: getGlobalCallStacks requires support for MPI_THREAD_MULTIPLE\n" );
+        return getAllCallStacks();
+    }
+#endif
+    if ( activeThreads().size() == 1 ) {
+        printf( "Warning: getAllCallStacks not supported on this OS, defaulting to basic call "
+                "stack\n" );
+        return getAllCallStacks();
     }
     // Signal all processes that we want their stack for all threads
     int rank = 0;
@@ -1506,34 +1616,33 @@ StackTrace::multi_stack_info StackTrace::getGlobalCallStacks( )
     MPI_Comm_size( globalCommForGlobalCommStack, &size );
     MPI_Comm_rank( globalCommForGlobalCommStack, &rank );
     std::random_device rd;
-    std::mt19937 gen(rd());
-    std::uniform_int_distribution<> dis(2,0x7FFF);
-    int tag = dis(gen);
+    std::mt19937 gen( rd() );
+    std::uniform_int_distribution<> dis( 2, 0x7FFF );
+    int tag = dis( gen );
     std::vector<MPI_Request> sendRequest( size );
-    for (int i=0; i<size; i++) {
+    for ( int i = 0; i < size; i++ ) {
         if ( i == rank )
             continue;
         MPI_Isend( &tag, 1, MPI_INT, i, 1, globalCommForGlobalCommStack, &sendRequest[i] );
     }
     // Get the trace for the current process
-    auto threads = StackTrace::activeThreads( );
-    threads.erase( globalMonitorThread->native_handle() );
+    auto threads = StackTrace::activeThreads();
     StackTrace::multi_stack_info multistack;
     for ( auto thread : threads ) {
         auto stack = StackTrace::getCallStack( thread );
         multistack.add( stack.size(), stack.data() );
     }
     // Recieve the backtrace for all processes/threads
-    int N_finished = 1;
-    auto start = std::chrono::steady_clock::now();
-    double time = 0;
-    const double max_time = 2.0 + size*20e-3;
-    while ( N_finished<size && time<max_time ) {
+    int N_finished        = 1;
+    auto start            = std::chrono::steady_clock::now();
+    double time           = 0;
+    const double max_time = 2.0 + size * 20e-3;
+    while ( N_finished < size && time < max_time ) {
         int flag = 0;
         MPI_Status status;
         int err = MPI_Iprobe( MPI_ANY_SOURCE, tag, globalCommForGlobalCommStack, &flag, &status );
         if ( err != MPI_SUCCESS ) {
-            printf("Internal error in StackTrace::getGlobalCallStacks\n");
+            printf( "Internal error in StackTrace::getGlobalCallStacks\n" );
             break;
         } else if ( flag != 0 ) {
             // We recieved a response
@@ -1541,29 +1650,227 @@ StackTrace::multi_stack_info StackTrace::getGlobalCallStacks( )
             int count;
             MPI_Get_count( &status, MPI_CHAR, &count );
             std::vector<char> data( count, 0 );
-            MPI_Recv( data.data(), count, MPI_CHAR, src_rank, tag, globalCommForGlobalCommStack, &status );
+            MPI_Recv( data.data(),
+                      count,
+                      MPI_CHAR,
+                      src_rank,
+                      tag,
+                      globalCommForGlobalCommStack,
+                      &status );
             auto stack_list = unpack( data );
-            for ( const auto& stack : stack_list )
+            for ( const auto &stack : stack_list )
                 multistack.add( stack.size(), stack.data() );
             N_finished++;
         } else {
             auto stop = std::chrono::steady_clock::now();
-            time = std::chrono::duration_cast<std::chrono::seconds>(stop-start).count();
+            time      = std::chrono::duration_cast<std::chrono::seconds>( stop - start ).count();
             std::this_thread::yield();
         }
     }
+    for ( int i = 0; i < size; i++ ) {
+        if ( i == rank )
+            continue;
+        MPI_Request_free( &sendRequest[i] );
+    }
     return multistack;
 }
 #else
-void StackTrace::globalCallStackInitialize( MPI_Comm )
-{
-}
-void StackTrace::globalCallStackFinalize( )
-{
-}
-StackTrace::multi_stack_info StackTrace::getGlobalCallStacks( )
-{
-    return getAllCallStacks( );
-}
+void StackTrace::globalCallStackInitialize( MPI_Comm ) {} // No-op when USE_MPI is not defined
+void StackTrace::globalCallStackFinalize() {}             // No-op when USE_MPI is not defined
+StackTrace::multi_stack_info StackTrace::getGlobalCallStacks() { return getAllCallStacks(); } // Forwards to getAllCallStacks()
 #endif
 
+
+/****************************************************************************
+ *  Cleanup the call stack                                                   *
+ ****************************************************************************/
+// Return the index one past the '>' that closes the '<' at str[pos], counting nested brackets.
+// If the bracket is never closed the result is str.size().
+static inline size_t findMatching( const std::string &str, size_t pos )
+{
+    // The caller must point at an opening bracket; otherwise report it and return pos unchanged
+    if ( str[pos] != '<' ) {
+        perr << "Internal error string matching\n";
+        perr << "   " << str << std::endl;
+        perr << "   " << pos << std::endl;
+        return pos;
+    }
+    // depth is the number of '<' still waiting for their '>'
+    int depth    = 1;
+    size_t index = pos + 1;
+    // Each '<' adds one level and each '>' removes one; stop once the opening bracket is closed
+    for ( ; depth != 0 && index < str.size(); index++ )
+        depth += ( str[index] == '<' ) - ( str[index] == '>' );
+    return index;
+}
+// Simplify a multi-stack in place: pass the object/file names through stripPath, drop selected
+// frames (signal-handler/backtrace helpers, libc/libstdc++ internals, MATLAB libraries) and shorten
+// verbose std:: template names in the function strings.  Each kept child is cleaned recursively.
+void StackTrace::cleanupStackTrace( multi_stack_info &stack )
+{
+    auto it           = stack.children.begin();
+    const size_t npos = std::string::npos;
+    while ( it != stack.children.end() ) {
+        auto &object      = it->stack.object;
+        auto &function    = it->stack.function;
+        auto &filename    = it->stack.filename;
+        bool remove_entry = false;
+        // Cleanup object and filename
+        object   = stripPath( object );
+        filename = stripPath( filename );
+        // Remove callstack (and all children) for threads that are just contributing
+        if ( function.find( "_callstack_signal_handler" ) != npos &&
+             filename.find( "StackTrace.cpp" ) != npos ) {
+            it = stack.children.erase( it );
+            continue;
+        }
+        // Remove __libc_start_main
+        if ( function.find( "__libc_start_main" ) != npos &&
+             filename.find( "libc-start.c" ) != npos )
+            remove_entry = true;
+        // Remove backtrace_thread
+        if ( function.find( "backtrace_thread" ) != npos &&
+             filename.find( "StackTrace.cpp" ) != npos )
+            remove_entry = true;
+        // Remove __restore_rt
+        if ( function.find( "__restore_rt" ) != npos && object.find( "libpthread" ) != npos )
+            remove_entry = true;
+        // Remove std::condition_variable::__wait_until_impl
+        if ( function.find( "std::condition_variable::__wait_until_impl" ) != npos &&
+             filename == "condition_variable" )
+            remove_entry = true;
+        // Remove std::_Function_handler<
+        if ( function.find( "std::_Function_handler<" ) != npos && filename == "functional" )
+            remove_entry = true;
+        // Remove std::_Bind_simple<
+        if ( function.find( "std::_Bind_simple<" ) != npos && filename == "functional" ) {
+            auto pos     = function.find( "std::_Bind_simple<" );
+            function     = function.substr( 0, pos ) + "std::_Bind_simple<...>(...)";
+            remove_entry = true;
+        }
+        // Remove std::this_thread::__sleep_for
+        if ( function.find( "std::this_thread::__sleep_for(" ) != npos &&
+             object.find( "libstdc++" ) != npos )
+            remove_entry = true;
+        // Remove std::thread::_Impl
+        if ( function.find( "std::thread::_Impl<" ) != npos && filename == "thread" )
+            remove_entry = true;
+        // Remove MATLAB internal routines
+        if ( object == "libmwmcr.so" || object == "libmwm_lxe.so" || object == "libmwbridge.so" ||
+             object == "libmwiqm.so" )
+            remove_entry = true;
+        // Remove the desired entry; entries with more than one child are left in place
+        if ( remove_entry ) {
+            if ( it->children.empty() ) {
+                it = stack.children.erase( it );
+                continue;
+            } else if ( it->children.size() == 1 ) {
+                // Promote the only child.  Copy it first: the child lives inside *it, so assigning
+                // it directly would read from the object that is being overwritten.
+                *it = multi_stack_info( it->children[0] );
+                continue;
+            }
+        }
+        // Cleanup template space
+        strrep( function, " >", ">" );
+        strrep( function, "< ", "<" );
+        // Replace std::chrono::duration with abbreviated version
+        // NOTE(review): both patterns contain " >", which the cleanup above presumably removed;
+        //    confirm against strrep whether these replacements can still match
+        if ( function.find( "std::chrono::duration<" ) != npos ) {
+            strrep( function, "std::chrono::duration<long, std::ratio<1l, 1l> >", "ticks" );
+            strrep( function,
+                    "std::chrono::duration<long, std::ratio<1l, 1000000000l> >",
+                    "nanoseconds" );
+        }
+        // Replace std::ratio with abbreviated version.
+        if ( function.find( "std::ratio<" ) != npos ) {
+            strrep( function, "std::ratio<1l, 1000000000000000000000000l>", "std::yocto" );
+            strrep( function, "std::ratio<1l, 1000000000000000000000l>", "std::zepto" );
+            strrep( function, "std::ratio<1l, 1000000000000000000l>", "std::atto" );
+            strrep( function, "std::ratio<1l, 1000000000000000l>", "std::femto" );
+            strrep( function, "std::ratio<1l, 1000000000000l>", "std::pico" );
+            strrep( function, "std::ratio<1l, 1000000000l>", "std::nano" );
+            strrep( function, "std::ratio<1l, 1000000l>", "std::micro" );
+            strrep( function, "std::ratio<1l, 1000l>", "std::milli" );
+            strrep( function, "std::ratio<1l, 100l>", "std::centi" );
+            strrep( function, "std::ratio<1l, 10l>", "std::deci" );
+            strrep( function, "std::ratio<1l, 1l>", "" );
+            strrep( function, "std::ratio<10l, 1l>", "std::deca" );
+            strrep( function, "std::ratio<60l, 1l>", "std::ratio<60>" );
+            strrep( function, "std::ratio<100l, 1l>", "std::hecto" );
+            strrep( function, "std::ratio<1000l, 1l>", "std::kilo" );
+            strrep( function, "std::ratio<3600l, 1l>", "std::ratio<3600>" );
+            strrep( function, "std::ratio<1000000l, 1l>", "std::mega" );
+            strrep( function, "std::ratio<1000000000l, 1l>", "std::giga" );
+            strrep( function, "std::ratio<1000000000000l, 1l>", "std::tera" );
+            strrep( function, "std::ratio<1000000000000000l, 1l>", "std::peta" );
+            strrep( function, "std::ratio<1000000000000000000l, 1l>", "std::exa" );
+            strrep( function, "std::ratio<1000000000000000000000l, 1l>", "std::zetta" );
+            strrep( function, "std::ratio<1000000000000000000000000l, 1l>", "std::yotta" );
+            strrep( function, " >", ">" );
+            strrep( function, "< ", "<" );
+        }
+        // Replace std::chrono::duration with abbreviated version.
+        if ( function.find( "std::chrono::duration<" ) != npos ) {
+            // clang-format off
+            strrep( function, "std::chrono::duration<long, std::nano>", "std::chrono::nanoseconds" );
+            strrep( function, "std::chrono::duration<long, std::micro>", "std::chrono::microseconds" );
+            strrep( function, "std::chrono::duration<long, std::milli>", "std::chrono::milliseconds" );
+            strrep( function, "std::chrono::duration<long>", "std::chrono::seconds" );
+            strrep( function, "std::chrono::duration<long,>", "std::chrono::seconds" );
+            strrep( function, "std::chrono::duration<long, std::ratio<60>>", "std::chrono::minutes" );
+            strrep( function, "std::chrono::duration<long, std::ratio<3600>>", "std::chrono::hours" );
+            strrep( function, " >", ">" );
+            strrep( function, "< ", "<" );
+            // clang-format on
+        }
+        // Replace std::this_thread::sleep_for with abbreviated version.
+        if ( function.find( "::sleep_for<" ) != npos ) {
+            strrep( function, "::sleep_for<long, std::nano>", "::sleep_for<nanoseconds>" );
+            strrep( function, "::sleep_for<long, std::micro>", "::sleep_for<microseconds>" );
+            strrep( function, "::sleep_for<long, std::milli>", "::sleep_for<milliseconds>" );
+            strrep( function, "::sleep_for<long>", "::sleep_for<seconds>" );
+            strrep( function, "::sleep_for<long,>", "::sleep_for<seconds>" );
+            strrep( function, "::sleep_for<long, std::ratio<60>>", "::sleep_for<minutes>" );
+            strrep( function, "::sleep_for<long, std::ratio<3600>>", "::sleep_for<hours>" );
+            // Drop the template argument when it only repeats the argument type, e.g.
+            //    ::sleep_for<hours>(std::chrono::hours  ->  ::sleep_for(std::chrono::hours
+            const char *units[] = { "nanoseconds", "microseconds", "milliseconds",
+                                    "seconds",     "minutes",      "hours" };
+            for ( const char *unit : units ) {
+                const std::string name( unit );
+                const std::string from = "::sleep_for<" + name + ">(std::chrono::" + name;
+                const std::string to   = "::sleep_for(std::chrono::" + name;
+                strrep( function, from.c_str(), to.c_str() );
+            }
+        }
+        // Replace std::basic_string with abbreviated version
+        size_t pos = 0;
+        while ( pos < function.size() ) {
+            // Find next instance of std::basic_string
+            const std::string match = "std::basic_string<";
+            pos                     = function.find( match, pos );
+            if ( pos == npos )
+                break;
+            // Find the matching >
+            size_t pos1 = pos + match.size() - 1;
+            size_t pos2 = findMatching( function, pos1 );
+            if ( pos2 == pos1 )
+                break;
+            // Test the longer type names first: "char" is also a prefix of char16_t/char32_t
+            if ( function.substr( pos1 + 1, 8 ) == "char16_t" )
+                function.replace( pos, pos2 - pos, "std::u16string" );
+            else if ( function.substr( pos1 + 1, 8 ) == "char32_t" )
+                function.replace( pos, pos2 - pos, "std::u32string" );
+            else if ( function.substr( pos1 + 1, 7 ) == "wchar_t" )
+                function.replace( pos, pos2 - pos, "std::wstring" );
+            else if ( function.substr( pos1 + 1, 4 ) == "char" )
+                function.replace( pos, pos2 - pos, "std::string" );
+            pos++;
+        }
+        // Cleanup the children
+        cleanupStackTrace( *it );
+        ++it;
+    }
+}
diff --git a/common/StackTrace.h b/common/StackTrace.h
index f3ca5698..8d436bf7 100644
--- a/common/StackTrace.h
+++ b/common/StackTrace.h
@@ -1,14 +1,11 @@
-#ifndef included_AtomicStackTrace
-#define included_AtomicStackTrace
+#ifndef included_StackTrace
+#define included_StackTrace
 
 #include <functional>
 #include <iostream>
-#include <stdio.h>
-#include <stdlib.h>
-#include <vector>
-#include <thread>
-#include <memory>
 #include <set>
+#include <thread>
+#include <vector>
 
 
 // Check for and include MPI
@@ -39,35 +36,51 @@ struct stack_info {
     int line;
     //! Default constructor
     stack_info() : address( nullptr ), address2( nullptr ), line( 0 ) {}
+    //! Reset the stack
+    void clear();
     //! Operator==
-    bool operator==( const stack_info& rhs ) const;
+    bool operator==( const stack_info &rhs ) const;
     //! Operator!=
-    bool operator!=( const stack_info& rhs ) const;
+    bool operator!=( const stack_info &rhs ) const;
+    //! Get the minimum width to print the addresses
+    int getAddressWidth() const;
     //! Print the stack info
-    std::string print() const;
+    std::string print( int widthAddress = 16, int widthObject = 20, int widthFunction = 32 ) const;
     //! Compute the number of bytes needed to store the object
     size_t size() const;
     //! Pack the data to a byte array, returning a pointer to the end of the data
-    char* pack( char* ptr ) const;
+    char *pack( char *ptr ) const;
     //! Unpack the data from a byte array, returning a pointer to the end of the data
-    const char* unpack( const char* ptr );
+    const char *unpack( const char *ptr );
     //! Pack a vector of data to a memory block
-    static std::vector<char> packArray( const std::vector<stack_info>& data );
+    static std::vector<char> packArray( const std::vector<stack_info> &data );
     //! Unpack a vector of data from a memory block
-    static std::vector<stack_info> unpackArray( const char* data );
+    static std::vector<stack_info> unpackArray( const char *data );
 };
 
 
 struct multi_stack_info {
-    int N;
-    stack_info stack;
-    std::vector<multi_stack_info> children;
+    int N;                                  // Number of threads/processes
+    stack_info stack;                       // Current stack item
+    std::vector<multi_stack_info> children; // Children
     //! Default constructor
     multi_stack_info() : N( 0 ) {}
+    //! Construct from a simple call stack
+    explicit multi_stack_info( const std::vector<stack_info> & );
+    //! Assignment operator from a simple call stack
+    multi_stack_info &operator=( const std::vector<stack_info> & );
+    //! Reset the stack
+    void clear();
     //! Add the given stack to the multistack
-    void add( size_t N, const stack_info *stack );
+    void add( size_t len, const stack_info *stack );
     //! Print the stack info
-    std::vector<std::string> print( const std::string& prefix=std::string() ) const;
+    std::vector<std::string> print( const std::string &prefix = std::string() ) const;
+
+private:
+    void print2( const std::string &prefix, int w[3], std::vector<std::string> &text ) const;
+    int getAddressWidth() const;
+    int getObjectWidth() const;
+    int getFunctionWidth() const;
 };
 
 
@@ -95,7 +108,7 @@ std::vector<stack_info> getCallStack( std::thread::native_handle_type id );
  *    Note: This functionality may not be availible on all platforms
  * @return          Returns vector containing the stack
  */
-multi_stack_info getAllCallStacks( );
+multi_stack_info getAllCallStacks();
 
 
 /*!
@@ -107,7 +120,17 @@ multi_stack_info getAllCallStacks( );
  *    Note: This functionality may not be availible on all platforms
  * @return          Returns vector containing the stack
  */
-multi_stack_info getGlobalCallStacks( );
+multi_stack_info getGlobalCallStacks();
+
+
+/*!
+ * @brief  Clean up the stack trace
+ * @details  This function modifies the stack trace to remove entries
+ *    related to acquiring the stack trace in an attempt to make it
+ *    more useful for display/users.
+ * @param[in,out] stack     The stack trace to modify
+ */
+void cleanupStackTrace( multi_stack_info &stack );
 
 
 //! Function to return the current call stack for the current thread
@@ -136,8 +159,9 @@ std::string signalName( int signal );
  * Return the symbols from the current executable (not availible for all platforms)
  * @return      Returns 0 if sucessful
  */
-int getSymbols(
-    std::vector<void *> &address, std::vector<char> &type, std::vector<std::string> &obj );
+int getSymbols( std::vector<void *> &address,
+                std::vector<char> &type,
+                std::vector<std::string> &obj );
 
 
 /*!
@@ -159,16 +183,17 @@ enum class terminateType { signal, exception };
 
 /*!
  * Set the error handlers
- * @param[in]   Function to terminate the program: abort(msg,type)
+ * @param[in] abort     Function to terminate the program: abort(msg,type)
  */
 void setErrorHandlers( std::function<void( std::string, terminateType )> abort );
 
 
 /*!
  * Set the given signals to the handler
- * @param[in]   Function to terminate the program: abort(msg,type)
+ * @param[in] signals   Signals to handle
+ * @param[in] handler   Signal handler to install for each signal: handler(signal)
  */
-void setSignals( const std::vector<int>& signals, void (*handler) (int) );
+void setSignals( const std::vector<int> &signals, void ( *handler )( int ) );
 
 
 //! Clear a signal set by setSignals
@@ -176,28 +201,28 @@ void clearSignal( int signal );
 
 
 //! Clear all signals set by setSignals
-void clearSignals( );
+void clearSignals();
 
 
 //! Return a list of all signals that can be caught
-std::vector<int> allSignalsToCatch( );
+std::vector<int> allSignalsToCatch();
 
 //! Return a default list of signals to catch
-std::vector<int> defaultSignalsToCatch( );
+std::vector<int> defaultSignalsToCatch();
 
 
 //! Get a list of the active threads
-std::set<std::thread::native_handle_type> activeThreads( );
+std::set<std::thread::native_handle_type> activeThreads();
 
 //! Get a handle to this thread
-std::thread::native_handle_type thisThread( );
+std::thread::native_handle_type thisThread();
 
 
 //! Initialize globalCallStack functionallity
 void globalCallStackInitialize( MPI_Comm comm );
 
 //! Clean up globalCallStack functionallity
-void globalCallStackFinalize( );
+void globalCallStackFinalize();
 
 
 /*!
@@ -208,9 +233,10 @@ void globalCallStackFinalize( );
  * @param[out] exit_code    Exit code returned from child process
  * @return                  Returns string containing the output
  */
-std::string exec( const std::string& cmd, int& exit_code );
+std::string exec( const std::string &cmd, int &exit_code );
 
 
 } // namespace StackTrace
 
+
 #endif
diff --git a/common/UnitTest.cpp b/common/UnitTest.cpp
index febc535c..b995fa68 100755
--- a/common/UnitTest.cpp
+++ b/common/UnitTest.cpp
@@ -1,345 +1,379 @@
-#include <iostream>
-#include <sstream>
-#include <vector>
-#include <string>
 #include "common/UnitTest.h"
 #include "common/Utilities.h"
+#include <cstring>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <vector>
 
 
-#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
-    // Windows
-    // Sleep is defined in milliseconds
-#else
-    // Linux
-    // usleep is defined in microseconds, create a Sleep command
-    #define Sleep(x) usleep(x*1000)
-#endif
-
+#define pout std::cout
+#define printp printf
 
 
 /********************************************************************
-*  Empty Constructor                                                *
-********************************************************************/
-UnitTest::UnitTest() {
-    #ifdef USE_MPI
-        comm = MPI_COMM_WORLD;
-    #endif
+ *  Constructor/Destructor                                           *
+ ********************************************************************/
+UnitTest::UnitTest()
+{
+#ifdef USE_MPI
+    comm = MPI_COMM_WORLD; // MPI builds start out on the world communicator
+#endif
+}
+UnitTest::~UnitTest() { reset(); } // drops all stored messages via reset()
+void UnitTest::reset()
+{
+    // Scoped guard: the mutex is released on every exit path
+    std::lock_guard<std::mutex> lock( mutex );
+    // Swap with empty temporaries so the old storage is freed (clear() would keep capacity)
+    std::vector<std::string>().swap( pass_messages );
+    std::vector<std::string>().swap( fail_messages );
+    std::vector<std::string>().swap( expected_fail_messages );
 }
 
 
 /********************************************************************
-*  Print a global report                                            *
-*  Note: only rank 0 will print, all messages will be aggregated    *
-********************************************************************/
-void UnitTest::report(const int level0) {
+ *  Add a pass, fail, expected failure message in a thread-safe way  *
+ ********************************************************************/
+void UnitTest::passes( const std::string &in )
+{
+    // Scoped guard: the mutex is released even if push_back throws (e.g. std::bad_alloc)
+    std::lock_guard<std::mutex> lock( mutex );
+    pass_messages.push_back( in );
+}
+void UnitTest::failure( const std::string &in )
+{
+    // Scoped guard: the mutex is released even if push_back throws (e.g. std::bad_alloc)
+    std::lock_guard<std::mutex> lock( mutex );
+    fail_messages.push_back( in );
+}
+void UnitTest::expected_failure( const std::string &in )
+{
+    // Scoped guard: the mutex is released even if push_back throws (e.g. std::bad_alloc)
+    std::lock_guard<std::mutex> lock( mutex );
+    expected_fail_messages.push_back( in );
+}
+
+
+/********************************************************************
+ *  Print a global report                                            *
+ *  Note: only rank 0 will print, all messages will be aggregated    *
+ ********************************************************************/
+inline std::vector<int> UnitTest::allGather( int value ) const
+{
+    // Pre-fill every slot with the local value so a serial / single-rank run is already complete
+    const int nproc = getSize();
+    std::vector<int> gathered( nproc, value );
+#ifdef USE_MPI
+    if ( nproc > 1 ) MPI_Allgather( &value, 1, MPI_INT, gathered.data(), 1, MPI_INT, comm );
+#endif
+    return gathered;
+}
+inline void UnitTest::barrier() const
+{
+#ifdef USE_MPI
+    if ( getSize() > 1 ) // nothing to synchronize with on a single rank
+        MPI_Barrier( comm );
+#endif
+}
+static inline void print_messages( const std::vector<std::vector<std::string>> &messages )
+{
+    if ( messages.size() > 1 ) { // several ranks: group the output under a per-rank heading
+        for ( size_t i = 0; i < messages.size(); i++ ) {
+            if ( !messages[i].empty() ) {
+                printp( "     Proccessor %i:\n", static_cast<int>( i ) );
+                for ( const auto &j : messages[i] )
+                    pout << "        " << j << std::endl;
+            }
+        }
+    } else if ( !messages.empty() ) { // single rank: no heading; an empty list prints nothing
+        for ( const auto &j : messages[0] )
+            pout << "    " << j << std::endl;
+    }
+}
+void UnitTest::report( const int level0 ) const
+{
+    std::lock_guard<std::mutex> lock( mutex ); // released on every exit, incl. a throwing ERROR
     int size = getSize();
     int rank = getRank();
     // Broadcast the print level from rank 0
     int level = level0;
-    #ifdef USE_MPI
-        if ( getSize() > 1 )
-            MPI_Bcast( &level, 1, MPI_INT, 0, comm );
-    #endif
-    if ( level<0 || level > 2 )
-        ERROR("Invalid print level");
+#ifdef USE_MPI
+    if ( getSize() > 1 )
+        MPI_Bcast( &level, 1, MPI_INT, 0, comm );
+#endif
+    if ( level < 0 || level > 2 )
+        ERROR( "Invalid print level" );
     // Perform a global all gather to get the number of failures per processor
-    std::vector<int> N_pass(size,0);
-    std::vector<int> N_fail(size,0);
-    std::vector<int> N_expected_fail(size,0);
-    int local_pass_size = (int) pass_messages.size();
-    int local_fail_size = (int) fail_messages.size();
-    int local_expected_fail_size = (int) expected_fail_messages.size();
-    if ( getSize() > 1 ) {
-        #ifdef USE_MPI
-            MPI_Allgather( &local_pass_size, 1, MPI_INT, &N_pass[0], 1, MPI_INT, comm);
-            MPI_Allgather( &local_fail_size, 1, MPI_INT, &N_fail[0], 1, MPI_INT, comm);
-            MPI_Allgather( &local_expected_fail_size, 1, MPI_INT, &N_expected_fail[0], 1, MPI_INT, comm);
-        #endif 
-    } else {
-        N_pass[0] = local_pass_size;
-        N_fail[0] = local_fail_size;
-        N_expected_fail[0] = local_expected_fail_size;
-    }
-    int N_pass_tot = 0;
+    auto N_pass             = allGather( pass_messages.size() );
+    auto N_fail             = allGather( fail_messages.size() );
+    auto N_expected_fail    = allGather( expected_fail_messages.size() );
+    int N_pass_tot          = 0;
+    int N_fail_tot          = 0;
     int N_expected_fail_tot = 0;
-    for (int i=0; i<size; i++) {
+    for ( int i = 0; i < size; i++ ) {
         N_pass_tot += N_pass[i];
+        N_fail_tot += N_fail[i];
         N_expected_fail_tot += N_expected_fail[i];
     }
     // Send all messages to rank 0 (if needed)
-    std::vector< std::vector<std::string> > pass_messages_rank(size);
-    std::vector< std::vector<std::string> > fail_messages_rank(size);
-    std::vector< std::vector<std::string> > expected_fail_rank(size);
+    std::vector<std::vector<std::string>> pass_messages_rank( size );
+    std::vector<std::vector<std::string>> fail_messages_rank( size );
+    std::vector<std::vector<std::string>> expected_fail_rank( size );
     // Get the pass messages
-    if ( ( level==1 && N_pass_tot<=20 ) || level==2 ) {
-        if ( rank==0 ) {
-            // Rank 0 should receive all messages
-            for (int i=0; i<size; i++) {
-                if ( i==0 )
-                    pass_messages_rank[i] = pass_messages;
-                else if ( N_pass[i]>0 )
-                    pass_messages_rank[i] = unpack_message_stream(i,1);
-            }
-        } else if ( pass_messages.size() ) {
-            // All other ranks send their message (use non-blocking communication)
-            pack_message_stream(pass_messages,0,1);
-        }
-    }
+    if ( ( level == 1 && N_pass_tot <= 20 ) || level == 2 )
+        pass_messages_rank = UnitTest::gatherMessages( pass_messages, 1 );
     // Get the fail messages
-    if ( level==1 || level==2 ) {
-        if ( rank==0 ) {
-            // Rank 0 should receive all messages
-            for (int i=0; i<size; i++) {
-                if ( i==0 )
-                    fail_messages_rank[i] = fail_messages;
-                else if ( N_fail[i]>0 )
-                    fail_messages_rank[i] = unpack_message_stream(i,2);
-            }
-        } else if ( !fail_messages.empty() ){
-            // All other ranks send their message (use non-blocking communication)
-            pack_message_stream(fail_messages,0,2);
-        }
-    }
+    if ( level == 1 || level == 2 )
+        fail_messages_rank = UnitTest::gatherMessages( fail_messages, 2 );
     // Get the expected_fail messages
-    if ( ( level==1 && N_expected_fail_tot<=50 ) || level==2 ) {
-        if ( rank==0 ) {
-            // Rank 0 should receive all messages
-            for (int i=0; i<size; i++) {
-                if ( i==0 )
-                    expected_fail_rank[i] = expected_fail_messages;
-                else if ( N_expected_fail[i]>0 )
-                    expected_fail_rank[i] = unpack_message_stream(i,3);
-            }
-        } else if ( !expected_fail_messages.empty() ){
-            // All other ranks send their message (use non-blocking communication)
-            pack_message_stream(expected_fail_messages,0,3);
-        }
-    }
+    if ( ( level == 1 && N_expected_fail_tot <= 50 ) || level == 2 )
+        expected_fail_rank = UnitTest::gatherMessages( expected_fail_messages, 3 );
     // Print the results of all messages (only rank 0 will print)
-    if ( rank==0 ) {
-        std::cout << std::endl;
+    if ( rank == 0 ) {
+        pout << std::endl;
         // Print the passed tests
-        std::cout << "Tests passed" << std::endl;
-        if ( level==0 || ( level==1 && N_pass_tot>20 ) ) {
+        pout << "Tests passed" << std::endl;
+        if ( level == 0 || ( level == 1 && N_pass_tot > 20 ) ) {
             // We want to print a summary
-            if ( size>8 ) {
+            if ( size > 8 ) {
                 // Print 1 summary for all processors
-                std::cout << "     " << N_pass_tot << " tests passed (use report level 2 for more detail)" << std::endl;
+                printp( "     %i tests passed (use report level 2 for more detail)\n", N_pass_tot );
             } else {
                 // Print a summary for each processor
-                for (int i=0; i<size; i++)
-                    std::cout << "     " << N_pass[i] << " tests passed (proc " << i << ") (use report level 2 for more detail)" << std::endl;
+                for ( int i = 0; i < size; i++ )
+                    printp( "     %i tests passed (proc %i) (use report level 2 for more detail)\n",
+                        N_pass[i], i );
             }
         } else {
             // We want to print all messages
-            for (int i=0; i<size; i++) {
-                ASSERT( (int)pass_messages_rank[i].size() == N_pass[i] );
-                if ( N_pass[i] > 0 ) {
-                    std::cout << "     Proccessor " << i << ":" << std::endl;
-                    for (unsigned int j=0; j<pass_messages_rank[i].size(); j++)
-                        std::cout << "        " <<  pass_messages_rank[i][j] << std::endl;
-                }
-            }
+            for ( int i = 0; i < size; i++ )
+                ASSERT( (int) pass_messages_rank[i].size() == N_pass[i] );
+            print_messages( pass_messages_rank );
         }
-        std::cout << std::endl;
+        pout << std::endl;
         // Print the tests that failed
-        std::cout << "Tests failed" << std::endl;
-        if ( level==0  ) {
+        pout << "Tests failed" << std::endl;
+        if ( level == 0 ) {
             // We want to print a summary
-            if ( size>8 ) {
+            if ( size > 8 ) {
                 // Print 1 summary for all processors
-                std::cout << "     " << N_pass_tot << " tests failed (use report level 2 for more detail)" << std::endl;
+                printp( "     %i tests failed (use report level 2 for more detail)\n", N_fail_tot );
             } else {
                 // Print a summary for each processor
-                for (int i=0; i<size; i++)
-                    std::cout << "     " << N_fail[i] << " tests failed (proc " << i << ") (use report level 1 or 2 for more detail)" << std::endl;
+                for ( int i = 0; i < size; i++ )
+                    printp( "     %i tests failed (proc %i) (use report level 2 for more detail)\n",
+                        N_fail[i], i );
             }
         } else {
             // We want to print all messages
-            for (int i=0; i<size; i++) {
-                ASSERT( (int)fail_messages_rank[i].size() == N_fail[i] );
-                if ( N_fail[i] > 0 ) {
-                    std::cout << "     Processor " << i << ":" << std::endl;
-                    for (unsigned int j=0; j<fail_messages_rank[i].size(); j++)
-                        std::cout << "        " <<  fail_messages_rank[i][j] << std::endl;
-                }
-            }
+            for ( int i = 0; i < size; i++ )
+                ASSERT( (int) fail_messages_rank[i].size() == N_fail[i] );
+            print_messages( fail_messages_rank );
         }
-        std::cout << std::endl;
+        pout << std::endl;
         // Print the tests that expected failed
-        std::cout << "Tests expected failed" << std::endl;
-        if ( level==0 || ( level==1 && N_expected_fail_tot>50 ) ) {
+        pout << "Tests expected failed" << std::endl;
+        if ( level == 0 || ( level == 1 && N_expected_fail_tot > 50 ) ) {
             // We want to print a summary
-            if ( size>8 ) {
+            if ( size > 8 ) {
                 // Print 1 summary for all processors
-                std::cout << "     " << N_expected_fail_tot << " tests expected failed (use report level 2 for more detail)" << std::endl;
+                printp( "     %i tests expected failed (use report level 2 for more detail)\n",
+                    N_expected_fail_tot );
             } else {
                 // Print a summary for each processor
-                for (int i=0; i<size; i++)
-                    std::cout << "     " << N_expected_fail[i] << " tests expected failed (proc " << i << ") (use report level 1 or 2 for more detail)" << std::endl;
+                for ( int i = 0; i < size; i++ )
+                    printp( "     %i tests expected failed (proc %i) (use report level 2 for more "
+                            "detail)\n",
+                        N_expected_fail[i], i );
             }
         } else {
             // We want to print all messages
-            for (int i=0; i<size; i++) {
-                ASSERT( (int)expected_fail_rank[i].size() == N_expected_fail[i] );
-                if ( N_expected_fail[i] > 0 ) {
-                    std::cout << "     Processor " << i << ":" << std::endl;
-                    for (unsigned int j=0; j<expected_fail_rank[i].size(); j++)
-                        std::cout << "        " <<  expected_fail_rank[i][j] << std::endl;
-                }
-            }
+            for ( int i = 0; i < size; i++ )
+                ASSERT( (int) expected_fail_rank[i].size() == N_expected_fail[i] );
+            print_messages( expected_fail_rank );
         }
-        std::cout << std::endl;
+        pout << std::endl;
     }
     // Add a barrier to synchronize all processors (rank 0 is much slower)
-    #ifdef USE_MPI
-        if ( getSize() > 1 )
-            MPI_Barrier(comm);
-    #endif
-}
-
-
-
-/********************************************************************
-*  Pack and send the given messages                                 *
-********************************************************************/
-void UnitTest::pack_message_stream(const std::vector<std::string>& messages, const int rank, const int tag)
-{
-    #ifdef USE_MPI
-        // Get the size of the messages
-        int N_messages = (int) messages.size();
-        int *msg_size = new int[N_messages];
-        int msg_size_tot = 0;
-        for (int i=0; i<N_messages; i++) {
-            msg_size[i] = (int) messages[i].size();
-            msg_size_tot += msg_size[i];
-        }
-        // Allocate space for the message stream
-        int size_data = (N_messages+1)*sizeof(int)+msg_size_tot;
-        char *data = new char[size_data];
-        // Pack the message stream
-        int *tmp = (int*) data;
-        tmp[0] = N_messages;
-        for (int i=0; i<N_messages; i++)
-            tmp[i+1] = msg_size[i];
-        int k = (N_messages+1)*sizeof(int);
-        for (int i=0; i<N_messages; i++) {
-            messages[i].copy(&data[k],msg_size[i]);
-            k += msg_size[i];
-        }
-        // Send the message stream (using a non-blocking send)
-        MPI_Request request;
-        MPI_Isend( data, size_data, MPI_CHAR, rank, tag, comm, &request );
-        // Wait for the communication to send and free the temporary memory
-        MPI_Status status;
-        MPI_Wait( &request, &status );
-        delete [] data;
-        delete [] msg_size;
-    #endif
+    barrier();
+    Utilities::sleep_ms( 10 ); // Need a brief pause to allow any printing to finish
+    // The lock_guard taken at the top releases the mutex when this scope ends
 }
 
 
 /********************************************************************
-*  receive and unpack a message stream                              *
-********************************************************************/
-std::vector<std::string> UnitTest::unpack_message_stream(const int rank, const int tag)
+ *  Gather the messages to rank 0                                    *
+ ********************************************************************/
+std::vector<std::vector<std::string>> UnitTest::gatherMessages(
+    const std::vector<std::string> &local_messages, int tag ) const
 {
-    #ifdef USE_MPI
-        // Probe the message to get the message size
-        MPI_Status status;
-        MPI_Probe(rank,tag,comm,&status);
-        int size_data=-1;
-        MPI_Get_count(&status,MPI_BYTE,&size_data);
-        ASSERT(size_data>=0);
-        // Allocate memory to receive the data
-        char *data = new char[size_data];
-        // receive the data (using a non-blocking receive)
-        MPI_Request request;
-        MPI_Irecv( data, size_data, MPI_CHAR, rank, tag, comm, &request );
-        // Wait for the communication to be received
-        MPI_Wait( &request, &status );
-        // Unpack the message stream
-        int *tmp = (int*) data;
-        int N_messages = tmp[0];
-        int *msg_size = &tmp[1];
-        std::vector<std::string> messages(N_messages);
-        int k = (N_messages+1)*sizeof(int);
-        for (int i=0; i<N_messages; i++) {
-            messages[i] = std::string(&data[k],msg_size[i]);
-            k += msg_size[i];
+    const int rank = getRank();
+    const int size = getSize();
+    std::vector<std::vector<std::string>> messages( size ); // one slot per rank; filled on rank 0 only
+    if ( rank == 0 ) {
+        // Rank 0 should receive all messages
+        for ( int i = 0; i < size; i++ ) {
+            if ( i == 0 )
+                messages[i] = local_messages; // rank 0's own messages need no communication
+            else
+                messages[i] = unpack_message_stream( i, tag ); // received in rank order
         }
-        // Delete the temporary memory
-        delete [] data;
-        return messages;
-    #else
+    } else {
+        // All other ranks send their message (use non-blocking communication)
+        pack_message_stream( local_messages, 0, tag ); // sent even when local_messages is empty
+    }
+    return messages;
+}
+
+
+/********************************************************************
+ *  Pack and send the given messages                                 *
+ ********************************************************************/
+void UnitTest::pack_message_stream(
+    const std::vector<std::string> &messages, const int rank, const int tag ) const
+{
+#ifdef USE_MPI
+    // Get the size of the messages
+    auto N_messages  = (int) messages.size();
+    auto *msg_size   = new int[N_messages];
+    int msg_size_tot = 0;
+    for ( int i = 0; i < N_messages; i++ ) {
+        msg_size[i] = (int) messages[i].size();
+        msg_size_tot += msg_size[i];
+    }
+    // Allocate space for the message stream
+    size_t size_data = ( N_messages + 1 ) * sizeof( int ) + msg_size_tot;
+    auto *data       = new char[size_data];
+    // Pack the message stream
+    memcpy( data, &N_messages, sizeof( int ) );
+    memcpy( &data[sizeof( int )], msg_size, N_messages * sizeof( int ) );
+    size_t k = ( N_messages + 1 ) * sizeof( int );
+    for ( int i = 0; i < N_messages; i++ ) {
+        messages[i].copy( &data[k], msg_size[i] );
+        k += msg_size[i];
+    }
+    // Send the message stream (using a non-blocking send)
+    MPI_Request request;
+    MPI_Isend( data, size_data, MPI_CHAR, rank, tag, comm, &request );
+    // Wait for the communication to send and free the temporary memory
+    MPI_Status status;
+    MPI_Wait( &request, &status );
+    delete[] data;
+    delete[] msg_size;
+#else
+    NULL_USE( messages );
+    NULL_USE( rank );
+    NULL_USE( tag );
+#endif
+}
+
+
+/********************************************************************
+ *  Receive and unpack a message stream                              *
+ ********************************************************************/
+std::vector<std::string> UnitTest::unpack_message_stream( const int rank, const int tag ) const
+{
+#ifdef USE_MPI
+    // Probe the message to get the message size
+    MPI_Status status;
+    MPI_Probe( rank, tag, comm, &status );
+    int size_data = -1;
+    MPI_Get_count( &status, MPI_BYTE, &size_data );
+    ASSERT( size_data >= 0 );
+    // Allocate memory to receive the data
+    auto *data = new char[size_data];
+    // receive the data (using a non-blocking receive)
+    MPI_Request request;
+    MPI_Irecv( data, size_data, MPI_CHAR, rank, tag, comm, &request );
+    // Wait for the communication to be received
+    MPI_Wait( &request, &status );
+    // Unpack the message stream
+    int N_messages = 0;
+    memcpy( &N_messages, data, sizeof( int ) );
+    if ( N_messages == 0 ) {
+        delete[] data;
         return std::vector<std::string>();
-    #endif
+    }
+    std::vector<int> msg_size( N_messages );
+    std::vector<std::string> messages( N_messages );
+    memcpy( msg_size.data(), &data[sizeof( int )], N_messages * sizeof( int ) );
+    int k = ( N_messages + 1 ) * sizeof( int );
+    for ( int i = 0; i < N_messages; i++ ) {
+        messages[i] = std::string( &data[k], msg_size[i] );
+        k += msg_size[i];
+    }
+    delete[] data;
+    return messages;
+#else
+    NULL_USE( rank );
+    NULL_USE( tag );
+    return std::vector<std::string>();
+#endif
 }
+
+
+/********************************************************************
+ *  Pack and send the given messages                                 *
+ ********************************************************************/
+void UnitTest::pack_message_stream(
+    const std::vector<std::string> &messages, const int rank, const int tag ) const
+{
+#ifdef USE_MPI
+    // Get the size of the messages
+    auto N_messages  = (int) messages.size();
+    auto *msg_size   = new int[N_messages];
+    int msg_size_tot = 0;
+    for ( int i = 0; i < N_messages; i++ ) {
+        msg_size[i] = (int) messages[i].size();
+        msg_size_tot += msg_size[i];
+    }
+    // Allocate space for the message stream
+    size_t size_data = ( N_messages + 1 ) * sizeof( int ) + msg_size_tot;
+    auto *data       = new char[size_data];
+    // Pack the message stream
+    memcpy( data, &N_messages, sizeof( int ) );
+    memcpy( &data[sizeof( int )], msg_size, N_messages * sizeof( int ) );
+    size_t k = ( N_messages + 1 ) * sizeof( int );
+    for ( int i = 0; i < N_messages; i++ ) {
+        messages[i].copy( &data[k], msg_size[i] );
+        k += msg_size[i];
+    }
+    // Send the message stream (using a non-blocking send)
+    MPI_Request request;
+    MPI_Isend( data, size_data, MPI_CHAR, rank, tag, comm, &request );
+    // Wait for the communication to send and free the temporary memory
+    MPI_Status status;
+    MPI_Wait( &request, &status );
+    delete[] data;
+    delete[] msg_size;
+#else
+    NULL_USE( messages );
+    NULL_USE( rank );
+    NULL_USE( tag );
+#endif
+}
+
+
+/********************************************************************
+ *  Receive and unpack a message stream                              *
+ ********************************************************************/
+std::vector<std::string> UnitTest::unpack_message_stream( const int rank, const int tag ) const
+{
+#ifdef USE_MPI
+    // Probe the message to get the message size
+    MPI_Status status;
+    MPI_Probe( rank, tag, comm, &status );
+    int size_data = -1;
+    MPI_Get_count( &status, MPI_BYTE, &size_data );
+    ASSERT( size_data >= 0 );
+    // Allocate memory to receive the data
+    auto *data = new char[size_data];
+    // receive the data (using a non-blocking receive)
+    MPI_Request request;
+    MPI_Irecv( data, size_data, MPI_CHAR, rank, tag, comm, &request );
+    // Wait for the communication to be received
+    MPI_Wait( &request, &status );
+    // Unpack the message stream
+    int N_messages = 0;
+    memcpy( &N_messages, data, sizeof( int ) );
+    if ( N_messages == 0 ) {
+        delete[] data;
         return std::vector<std::string>();
-    #endif
+    }
+    std::vector<int> msg_size( N_messages );
+    std::vector<std::string> messages( N_messages );
+    memcpy( msg_size.data(), &data[sizeof( int )], N_messages * sizeof( int ) );
+    int k = ( N_messages + 1 ) * sizeof( int );
+    for ( int i = 0; i < N_messages; i++ ) {
+        messages[i] = std::string( &data[k], msg_size[i] );
+        k += msg_size[i];
+    }
+    delete[] data;
+    return messages;
+#else
+    NULL_USE( rank );
+    NULL_USE( tag );
+    return std::vector<std::string>();
+#endif
 }
 
 
 /********************************************************************
-*  Other functions                                                  *
-********************************************************************/
-int UnitTest::getRank()
+ *  Other functions                                                  *
+ ********************************************************************/
+int UnitTest::getRank() const
 {
     int rank = 0;
-    #ifdef USE_MPI
-        int flag=0;
-        MPI_Initialized(&flag);
-        if ( flag )
-            MPI_Comm_rank( comm, &rank );
-    #endif
+#ifdef USE_MPI
+    int flag = 0;
+    MPI_Initialized( &flag );
+    if ( flag ) // rank stays 0 when MPI has not been initialized
+        MPI_Comm_rank( comm, &rank );
+#endif
     return rank;
 }
-int UnitTest::getSize()
+int UnitTest::getSize() const
 {
     int size = 1;
-    #ifdef USE_MPI
-        int flag=0;
-        MPI_Initialized(&flag);
-        if ( flag )
-            MPI_Comm_size( comm, &size );
-    #endif
+#ifdef USE_MPI
+    int flag = 0;
+    MPI_Initialized( &flag );
+    if ( flag ) // size stays 1 when MPI has not been initialized
+        MPI_Comm_size( comm, &size );
+#endif
     return size;
 }
-size_t UnitTest::NumPassGlobal()
+size_t UnitTest::NumPassGlobal() const
 {
     size_t num = pass_messages.size();
-    #ifdef USE_MPI
-        if ( getSize() > 1 ) {
-            int send = static_cast<int>(num);
-            int sum = 0;
-            MPI_Allreduce( &send, &sum, 1, MPI_INT, MPI_SUM, comm );
-            num = static_cast<size_t>(sum);
-        }
-    #endif
+#ifdef USE_MPI
+    if ( getSize() > 1 ) { // on a single rank the local count is already the global count
+        auto send = static_cast<int>( num ); // the reduction uses MPI_INT, so narrow to int
+        int sum   = 0;
+        MPI_Allreduce( &send, &sum, 1, MPI_INT, MPI_SUM, comm );
+        num = static_cast<size_t>( sum );
+    }
+#endif
     return num;
 }
-size_t UnitTest::NumFailGlobal()
+size_t UnitTest::NumFailGlobal() const
 {
     size_t num = fail_messages.size();
-    #ifdef USE_MPI
-        if ( getSize() > 1 ) {
-            int send = static_cast<int>(num);
-            int sum = 0;
-            MPI_Allreduce( &send, &sum, 1, MPI_INT, MPI_SUM, comm );
-            num = static_cast<size_t>(sum);
-        }
-    #endif
+#ifdef USE_MPI
+    if ( getSize() > 1 ) { // on a single rank the local count is already the global count
+        auto send = static_cast<int>( num ); // the reduction uses MPI_INT, so narrow to int
+        int sum   = 0;
+        MPI_Allreduce( &send, &sum, 1, MPI_INT, MPI_SUM, comm );
+        num = static_cast<size_t>( sum );
+    }
+#endif
     return num;
 }
-size_t UnitTest::NumExpectedFailGlobal()
+size_t UnitTest::NumExpectedFailGlobal() const
 {
     size_t num = expected_fail_messages.size();
-    #ifdef USE_MPI
-        if ( getSize() > 1 ) {
-            int send = static_cast<int>(num);
-            int sum = 0;
-            MPI_Allreduce( &send, &sum, 1, MPI_INT, MPI_SUM, comm );
-            num = static_cast<size_t>(sum);
-        }
-    #endif
+#ifdef USE_MPI
+    if ( getSize() > 1 ) { // on a single rank the local count is already the global count
+        auto send = static_cast<int>( num ); // the reduction uses MPI_INT, so narrow to int
+        int sum   = 0;
+        MPI_Allreduce( &send, &sum, 1, MPI_INT, MPI_SUM, comm );
+        num = static_cast<size_t>( sum );
+    }
+#endif
     return num;
 }
-
-
diff --git a/common/UnitTest.h b/common/UnitTest.h
index 365bac35..80503d19 100755
--- a/common/UnitTest.h
+++ b/common/UnitTest.h
@@ -1,11 +1,12 @@
 #ifndef included_UnitTest
 #define included_UnitTest
 
+#include <mutex>
 #include <sstream>
-#include <vector>
 #include <string>
+#include <vector>
 #ifdef USE_MPI
-    #include "mpi.h"
+#include "mpi.h"
 #endif
 
 
@@ -27,78 +28,92 @@
  * \endcode
 
  */
-class UnitTest {
+class UnitTest
+{
 public:
-
     //! Constructor
     UnitTest();
 
-    //! Indicate a passed test
-    virtual void passes (const std::string &in) { pass_messages.push_back(in); }
+    //! Destructor
+    virtual ~UnitTest();
 
-    //! Indicate a failed test
-    virtual void failure (const std::string &in) { fail_messages.push_back(in); }
+    //! Indicate a passed test (thread-safe)
+    virtual void passes( const std::string &in );
 
-    //! Indicate an expected failed test
-    virtual void expected_failure (const std::string &in) { expected_fail_messages.push_back(in); }
+    //! Indicate a failed test (thread-safe)
+    virtual void failure( const std::string &in );
+
+    //! Indicate an expected failed test (thread-safe)
+    virtual void expected_failure( const std::string &in );
 
     //! Return the number of passed tests locally
-    virtual size_t NumPassLocal () { return pass_messages.size(); }
+    virtual size_t NumPassLocal() const { return pass_messages.size(); }
 
     //! Return the number of failed tests locally
-    virtual size_t NumFailLocal () { return fail_messages.size(); }
+    virtual size_t NumFailLocal() const { return fail_messages.size(); }
 
     //! Return the number of expected failed tests locally
-    virtual size_t NumExpectedFailLocal () { return expected_fail_messages.size(); }
+    virtual size_t NumExpectedFailLocal() const { return expected_fail_messages.size(); }
 
     //! Return the number of passed tests locally
-    virtual size_t NumPassGlobal ();
+    virtual size_t NumPassGlobal() const;
 
     //! Return the number of failed tests locally
-    virtual size_t NumFailGlobal ();
+    virtual size_t NumFailGlobal() const;
 
     //! Return the number of expected failed tests locally
-    virtual size_t NumExpectedFailGlobal ();
+    virtual size_t NumExpectedFailGlobal() const;
 
     //! Return the rank of the current processor
-    int getRank ();
+    int getRank() const;
 
     //! Return the number of processors
-    int getSize ();
+    int getSize() const;
 
     /*!
      * Print a report of the passed and failed tests.
      * Note: This is a blocking call that all processors must execute together.
-     * Note: Only rank 0 will print the messages (this is necessary as other ranks may not be able to print correctly).
+     * Note: Only rank 0 will print the messages (this is necessary as other ranks may not be able
+     * to print correctly).
      * @param level     Optional integer specifying the level of reporting (default: 1)
      *                  0: Report the number of tests passed, failed, and expected failures.
-     *                  1: Report the number of passed tests (if <=20) or the number passed otherwise,
-     *                     report all failures,
-     *                     report the number of expected failed tests (if <=50) or the number passed otherwise.
+     *                  1: Report the number of passed tests (if <=20) or the number passed
+     *                     otherwise, report all failures, report the number of expected
+     *                     failed tests (if <=50) or the number passed otherwise.
      *                  2: Report all passed, failed, and expected failed tests.
      */
-     virtual void report(const int level=1);
+    virtual void report( const int level = 1 ) const;
+
+    //! Clear the messages
+    void reset();
 
 protected:
     std::vector<std::string> pass_messages;
     std::vector<std::string> fail_messages;
     std::vector<std::string> expected_fail_messages;
-    #ifdef USE_MPI
-        MPI_Comm comm;
-    #endif
+    mutable std::mutex mutex;
+#ifdef USE_MPI
+    MPI_Comm comm;
+#endif
 
 private:
     // Make the copy constructor private
-    UnitTest(const UnitTest& p) {}
+    UnitTest( const UnitTest & ) = delete; // non-copyable: holds a std::mutex and message lists
 
     // Function to pack the messages into a single data stream and send to the given processor
     // Note: This function does not return until the message stream has been sent
-    void pack_message_stream(const std::vector<std::string>& messages, const int rank, const int tag);
+    void pack_message_stream(
+        const std::vector<std::string> &messages, const int rank, const int tag ) const;
 
     // Function to unpack the messages from a single data stream
     // Note: This function does not return until the message stream has been received
-    std::vector<std::string> unpack_message_stream(const int rank, const int tag);
+    std::vector<std::string> unpack_message_stream( const int rank, const int tag ) const;
 
+    // Helper functions
+    inline void barrier() const;
+    inline std::vector<int> allGather( int value ) const;
+    inline std::vector<std::vector<std::string>> gatherMessages(
+        const std::vector<std::string> &local_messages, int tag ) const;
 };
 
 
diff --git a/common/Utilities.h b/common/Utilities.h
index e1f1713d..e6db4279 100644
--- a/common/Utilities.h
+++ b/common/Utilities.h
@@ -1,74 +1,107 @@
 #ifndef included_Utilities
 #define included_Utilities
 
+#include <chrono>
+#include <cstdarg>
+#include <iostream>
+#include <mutex>
 #include <stdio.h>
 #include <stdlib.h>
-#include <iostream>
+#include <sys/stat.h>
+#include <thread>
 #include <vector>
 
 
+namespace Utilities {
+
 
 /*!
- * Utilities is a Singleton class containing basic routines for error 
- * reporting, file manipulations, etc.  Included are a set of \ref Macros "macros" that are commonly used.
+ * Aborts the run after printing an error message with file and
+ * linenumber information.
  */
-namespace Utilities
-{
-
-    /*!
-     * Aborts the run after printing an error message with file and
-     * linenumber information.
-     */
-    void abort(const std::string &message, const std::string &filename, const int line);
+void abort( const std::string &message, const std::string &filename, const int line );
 
 
-    /*!
-     * Set the behavior of abort
-     * @param printMemory       Print the current memory usage (default is true)
-     * @param printStack        Print the current call stack (default is true)
-     * @param throwException    Throw an exception instead of MPI_Abort (default is false)
-     */
-    void setAbortBehavior( bool printMemory, bool printStack, bool throwException );
+/*!
+ * Set the behavior of abort
+ * @param printMemory       Print the current memory usage (default is true)
+ * @param printStack        Print the current call stack (default is true)
+ * @param throwException    Throw an exception instead of MPI_Abort (default is false)
+ */
+void setAbortBehavior( bool printMemory, bool printStack, bool throwException );
 
-    //! Function to set the error handlers
-    void setErrorHandlers();
-
-    /*!
-     * Function to get the memory availible.
-     * This function will return the total memory availible
-     * Note: depending on the implimentation, this number may be rounded to
-     * to a multiple of the page size.
-     * If this function fails, it will return 0.
-     */
-    size_t getSystemMemory();
-
-    /*!
-     * Function to get the memory usage.
-     * This function will return the total memory used by the application.
-     * Note: depending on the implimentation, this number may be rounded to
-     * to a multiple of the page size.
-     * If this function fails, it will return 0.
-     */
-    size_t getMemoryUsage();
+//! Function to set the error handlers
+void setErrorHandlers();
 
 
-    //! Function to get an arbitrary point in time
-    double time();
+/*!
+ * Function to get the memory available.
+ * This function will return the total memory available.
+ * Note: depending on the implementation, this number may be rounded
+ * to a multiple of the page size.
+ * If this function fails, it will return 0.
+ */
+size_t getSystemMemory();
 
-    //! Function to get the resolution of time
-    double tick();
 
-    //! Factor a number into it's prime factors
-    std::vector<int> factor(size_t number);
+/*!
+ * Function to get the memory usage.
+ * This function will return the total memory used by the application.
+ * Note: depending on the implementation, this number may be rounded
+ * to a multiple of the page size.
+ * If this function fails, it will return 0.
+ */
+size_t getMemoryUsage();
 
-    //! Print AMP Banner
-    void nullUse( void* );
+
+//! Function to get an arbitrary point in time
+double time();
+
+
+//! Function to get the resolution of time
+double tick();
+
+
+//! std::string version of sprintf
+inline std::string stringf( const char *format, ... );
+
+
+/*!
+ * Sleep for X ms
+ * @param N         Time to sleep (ms)
+ */
+inline void sleep_ms( int N ) { std::this_thread::sleep_for( std::chrono::milliseconds( N ) ); }
+
+
+/*!
+ * Sleep for X s
+ * @param N         Time to sleep (s)
+ */
+inline void sleep_s( int N ) { std::this_thread::sleep_for( std::chrono::seconds( N ) ); }
+
+
+//! Factor a number into its prime factors
+std::vector<int> factor(size_t number);
+
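+//! Null use of a pointer (NOTE(review): old text said "Print AMP Banner"; presumably a no-op to silence warnings - confirm)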
+void nullUse( void* );
 
 } // namespace Utilities
 
 
 #include "common/UtilityMacros.h"
 
+
+// stringf: printf-style formatting into a std::string (result is truncated to 4095 characters)
+inline std::string Utilities::stringf( const char *format, ... )
+{
+    va_list ap;
+    va_start( ap, format );
+    char tmp[4096];
+    vsnprintf( tmp, sizeof( tmp ), format, ap ); // bounded write: cannot overrun tmp, NUL-terminates
+    va_end( ap );
+    return std::string( tmp );
+}
+
+
 #endif
-
-
diff --git a/common/UtilityMacros.h b/common/UtilityMacros.h
index 2165b1d5..bfac172f 100644
--- a/common/UtilityMacros.h
+++ b/common/UtilityMacros.h
@@ -9,8 +9,8 @@
 #include <stdexcept>
 
 
-/*! \defgroup Macros Set of utility macro functions 
- *  \details  These functions are a list of C++ macros that are used 
+/*! \defgroup Macros Set of utility macro functions
+ *  \details  These functions are a list of C++ macros that are used
  *     for common operations, including checking for errors.
  *  \addtogroup Macros
  *  @{
@@ -19,13 +19,19 @@
 
 /*! \def NULL_STATEMENT
  *  \brief    A null statement
- *  \details  A statement that does nothing, for insure++ make it something 
+ *  \details  A statement that does nothing, for insure++ make it something
  * more complex than a simple C null statement to avoid a warning.
  */
+#ifndef NULL_STATEMENT
 #ifdef __INSURE__
-    #define NULL_STATEMENT do{if(0) int nullstatement=0 }}while(0)
+#define NULL_STATEMENT             \
+    do {                           \
+        if ( 0 )                   \
+            int nullstatement = 0; \
+    } while ( 0 )
 #else
-    #define NULL_STATEMENT
+#define NULL_STATEMENT
+#endif
 #endif
 
 
@@ -34,9 +40,15 @@
  *  \details  A null use of a variable, use to avoid GNU compiler warnings about unused variables.
  *  \param variable  Variable to pretend to use
  */
-#define NULL_USE(variable) do {                         \
-    if(0) {char *temp = (char *)&variable; temp++;}     \
-}while(0)
+#ifndef NULL_USE
+#define NULL_USE( variable )                \
+    do {                                    \
+        if ( 0 ) {                          \
+            auto temp = (char *) &variable; \
+            temp++;                         \
+        }                                   \
+    } while ( 0 )
+#endif
 
 
 /*! \def ERROR(MSG)
@@ -46,9 +58,10 @@
  *     line number of the abort are also printed.
  *  \param MSG  Error message to print
  */
-#define ERROR(MSG) do {                                 \
-    ::Utilities::abort(MSG,__FILE__,__LINE__);            \
-}while(0)
+#define ERROR(MSG)                                                \
+    do {                                                          \
+        ::Utilities::abort( MSG, __FILE__, __LINE__ );            \
+    } while ( 0 )
 
 
 /*! \def WARNING(MSG)
@@ -56,11 +69,13 @@
  *  \details Print a warning without exit.  Print file and line number of the warning.
  *  \param MSG  Warning message to print
  */
-#define WARNING(MSG) do {                                           \
-    std::stringstream tboxos;                                       \
-    tboxos << MSG << std::ends;                                     \
-    printf("WARNING: %s\n   Warning called in %s on line %i\n",tboxos.str().c_str(),__FILE__,__LINE__); \
-}while(0)
+#define WARNING(MSG)                                                    \
+    do {                                                                \
+        std::stringstream tboxos;                                       \
+        tboxos << MSG << std::ends;                                     \
+        printf("WARNING: %s\n   Warning called in %s on line %i\n",     \
+            tboxos.str().c_str(),__FILE__,__LINE__);                    \
+    }while(0)
 
 
 /*! \def ASSERT(EXP)
@@ -71,13 +86,14 @@
  *     The file and line number of the abort are printed along with the stack trace (if availible).
  *  \param EXP  Expression to evaluate
  */
-#define ASSERT(EXP) do {                                            \
-    if ( !(EXP) ) {                                                 \
-        std::stringstream tboxos;                                   \
-        tboxos << "Failed assertion: " << #EXP << std::ends;        \
-        ::Utilities::abort(tboxos.str(), __FILE__, __LINE__);         \
-    }                                                               \
-}while(0)
+#define ASSERT(EXP)                                                     \
+    do {                                                                \
+        if ( !(EXP) ) {                                                 \
+            std::stringstream tboxos;                                   \
+            tboxos << "Failed assertion: " << #EXP << std::ends;        \
+            ::Utilities::abort(tboxos.str(), __FILE__, __LINE__);       \
+        }                                                               \
+    }while(0)
 
 
 /*! \def INSIST(EXP,MSG)
@@ -99,7 +115,6 @@
 }while(0)
 
 
-
 /**
  * Macro for use when assertions are to be included
  * only when debugging.
@@ -118,6 +133,49 @@
 #endif
 
 
+/*! \def DISABLE_WARNINGS
+ *  \brief Suppress all warnings
+ *  \details This will start to suppress all compile warnings.
+ *      Be sure to follow with ENABLE_WARNINGS
+ */
+/*! \def ENABLE_WARNINGS
+ *  \brief Re-enable warnings
+ *  \details This will re-enable warnings after a call to DISABLE_WARNINGS
+ */
+// clang-format off
+#ifdef DISABLE_WARNINGS
+    // Macros previously defined
+#elif defined( USING_MSVC )
+    #define DISABLE_WARNINGS __pragma( warning( push, 0 ) )
+    #define ENABLE_WARNINGS __pragma( warning( pop ) )
+#elif defined( USING_CLANG )
+    #define DISABLE_WARNINGS                                                \
+        _Pragma( "clang diagnostic push" ) _Pragma( "clang diagnostic ignored \"-Wall\"" ) \
+        _Pragma( "clang diagnostic ignored \"-Wextra\"" )                   \
+        _Pragma( "clang diagnostic ignored \"-Wunused-private-field\"" )    \
+        _Pragma( "clang diagnostic ignored \"-Wmismatched-new-delete\"" )
+    #define ENABLE_WARNINGS _Pragma( "clang diagnostic pop" )
+#elif defined( USING_GCC )
+    // Note: We cannot disable the -Wliteral-suffix message with this macro because the
+    // pragma command cannot suppress warnings from the C++ preprocessor.  See gcc bug #53431.
+    #define DISABLE_WARNINGS                                                \
+        _Pragma( "GCC diagnostic push" ) _Pragma( "GCC diagnostic ignored \"-Wall\"" ) \
+        _Pragma( "GCC diagnostic ignored \"-Wextra\"" )                     \
+        _Pragma( "GCC diagnostic ignored \"-Wpragmas\"" )                     \
+        _Pragma( "GCC diagnostic ignored \"-Wunused-local-typedefs\"" )     \
+        _Pragma( "GCC diagnostic ignored \"-Woverloaded-virtual\"" )        \
+        _Pragma( "GCC diagnostic ignored \"-Wunused-parameter\"" )          \
+        _Pragma( "GCC diagnostic ignored \"-Warray-bounds\"" )              \
+        _Pragma( "GCC diagnostic ignored \"-Wterminate\"" )
+    #define ENABLE_WARNINGS _Pragma( "GCC diagnostic pop" )
+#else
+    #define DISABLE_WARNINGS
+    #define ENABLE_WARNINGS
+#endif
+// clang-format on
+
+
+
 /*! @} */
 
 
diff --git a/tests/lbpm_color_simulator.h b/tests/lbpm_color_simulator.h
index 626ef757..3d48655e 100644
--- a/tests/lbpm_color_simulator.h
+++ b/tests/lbpm_color_simulator.h
@@ -9,9 +9,24 @@
 #define ANALYSIS_INTERVAL 1000
 #define BLOBID_INTERVAL 1000
 
-enum AnalysisType{ AnalyzeNone=0, IdentifyBlobs=0x01, CopyPhaseIndicator=0x02, 
+
+// Bit flags selecting the analysis steps to run; combine with |= and query with matches()
+enum class AnalysisType : uint64_t { AnalyzeNone=0, IdentifyBlobs=0x01, CopyPhaseIndicator=0x02,
+    CopySimState=0x04, ComputeAverages=0x08, CreateRestart=0x10, WriteVis=0x20 };
+
+// Set in lhs every flag bit that is set in rhs and return lhs (inline: defined in a header)
+inline AnalysisType& operator |=(AnalysisType &lhs, AnalysisType rhs)
+{
+    using bits = std::underlying_type<AnalysisType>::type;
+    lhs = static_cast<AnalysisType>( static_cast<bits>(lhs) | static_cast<bits>(rhs) );
+    return lhs;
+}
+// True if x and y share a flag bit (the & must be parenthesized: != binds tighter than &)
+inline bool matches( AnalysisType x, AnalysisType y )
+{
+    using bits = std::underlying_type<AnalysisType>::type;
+    return ( static_cast<bits>(x) & static_cast<bits>(y) ) != 0;
+}
 
 template<class TYPE>
 void DeleteArray( const TYPE *p )
@@ -30,7 +45,7 @@ struct AnalysisWaitIdStruct {
 
 
 // Helper class to write the restart file from a seperate thread
-class WriteRestartWorkItem: public ThreadPool::WorkItem
+class WriteRestartWorkItem: public ThreadPool::WorkItemRet<void>
 {
 public:
     WriteRestartWorkItem( const char* filename_, std::shared_ptr<double> cDen_,
@@ -41,7 +56,6 @@ public:
         WriteCheckpoint(filename,cDen.get(),cfq.get(),N);
         PROFILE_STOP("Save Checkpoint",1);
     };
-    virtual bool has_result() const { return false; }
 private:
     WriteRestartWorkItem();
     const char* filename;
@@ -54,7 +68,7 @@ private:
 static const std::string id_map_filename = "lbpm_id_map.txt";
 typedef std::shared_ptr<std::pair<int,IntArray> > BlobIDstruct;
 typedef std::shared_ptr<std::vector<BlobIDType> > BlobIDList;
-class BlobIdentificationWorkItem1: public ThreadPool::WorkItem
+class BlobIdentificationWorkItem1: public ThreadPool::WorkItemRet<void>
 {
 public:
     BlobIdentificationWorkItem1( int timestep_, int Nx_, int Ny_, int Nz_, const RankInfoStruct& rank_info_, 
@@ -75,7 +89,6 @@ public:
         new_index->first = ComputeGlobalBlobIDs(Nx-2,Ny-2,Nz-2,rank_info,*phase,dist,vF,vS,ids,newcomm);
         PROFILE_STOP("Identify blobs",1);
     }
-    virtual bool has_result() const { return false; }
 private:
     BlobIdentificationWorkItem1();
     int timestep;
@@ -87,7 +100,7 @@ private:
     BlobIDList new_list;
     MPI_Comm newcomm;
 };
-class BlobIdentificationWorkItem2: public ThreadPool::WorkItem
+class BlobIdentificationWorkItem2: public ThreadPool::WorkItemRet<void>
 {
 public:
     BlobIdentificationWorkItem2( int timestep_, int Nx_, int Ny_, int Nz_, const RankInfoStruct& rank_info_, 
@@ -122,7 +135,6 @@ public:
         }
         PROFILE_STOP("Identify blobs maps",1);
     }
-    virtual bool has_result() const { return false; }
 private:
     BlobIdentificationWorkItem2();
     int timestep;
@@ -137,7 +149,7 @@ private:
 
 
 // Helper class to write the vis file from a thread
-class WriteVisWorkItem: public ThreadPool::WorkItem
+class WriteVisWorkItem: public ThreadPool::WorkItemRet<void>
 {
 public:
     WriteVisWorkItem( int timestep_, std::vector<IO::MeshDataStruct>& visData_,
@@ -164,7 +176,6 @@ public:
         IO::writeData( timestep, visData, newcomm );
         PROFILE_STOP("Save Vis",1);
     };
-    virtual bool has_result() const { return false; }
 private:
     WriteVisWorkItem();
     int timestep;
@@ -177,7 +188,7 @@ private:
 
 // Helper class to run the analysis from within a thread
 // Note: Averages will be modified after the constructor is called
-class AnalysisWorkItem: public ThreadPool::WorkItem
+class AnalysisWorkItem: public ThreadPool::WorkItemRet<void>
 {
 public:
     AnalysisWorkItem( AnalysisType type_, int timestep_, TwoPhase& Averages_, 
@@ -191,10 +202,10 @@ public:
         Averages.Label_NWP_map = *id_list;
         Averages.NumberComponents_WP = 1;
         Averages.Label_WP.fill(0.0);
-        if ( (type&CopyPhaseIndicator) != 0 ) {
+        if ( matches(type,AnalysisType::CopyPhaseIndicator) ) {
             // Averages.ColorToSignedDistance(beta,Averages.Phase,Averages.Phase_tplus);
         }
-        if ( (type&ComputeAverages) != 0 ) {
+        if ( matches(type,AnalysisType::ComputeAverages) ) {
             PROFILE_START("Compute dist",1);
             Averages.Initialize();
             Averages.ComputeDelPhi();
@@ -212,7 +223,6 @@ public:
             PROFILE_STOP("Compute dist",1);
         }
     }
-    virtual bool has_result() const { return false; }
 private:
     AnalysisWorkItem();
     AnalysisType type;
@@ -223,6 +233,7 @@ private:
     double beta;
 };
 
+
 // Function to start the analysis
 void run_analysis( int timestep, int restart_interval, 
     const RankInfoStruct& rank_info, ScaLBL_Communicator &ScaLBL_Comm, TwoPhase& Averages,
@@ -236,46 +247,45 @@ void run_analysis( int timestep, int restart_interval,
     int N = Nx*Ny*Nz;
 
     // Determin the analysis we want to perform
-    AnalysisType type = AnalyzeNone;
+    AnalysisType type = AnalysisType::AnalyzeNone;
     if ( timestep%ANALYSIS_INTERVAL + 5 == ANALYSIS_INTERVAL ) {
         // Copy the phase indicator field for the earlier timestep
-        type = static_cast<AnalysisType>( type | CopyPhaseIndicator );
+        type |= AnalysisType::CopyPhaseIndicator;
     }
     if ( timestep%BLOBID_INTERVAL == 0 ) {
         // Identify blobs and update global ids in time
-        type = static_cast<AnalysisType>( type | IdentifyBlobs );
+        type |= AnalysisType::IdentifyBlobs;
     }
-    /*    #ifdef USE_CUDA
+    /*#ifdef USE_CUDA
         if ( tpool.getQueueSize()<=3 && tpool.getNumThreads()>0 && timestep%50==0 ) {
             // Keep a few blob identifications queued up to keep the processors busy,
             // allowing us to track the blobs as fast as possible
             // Add more detailed estimates of the update frequency required to track blobs
-            type = static_cast<AnalysisType>( type | IdentifyBlobs );
+            type |= AnalysisType::IdentifyBlobs;
         }
-    #endif
-    */
+    #endif */
     if ( timestep%ANALYSIS_INTERVAL == 0 ) {
         // Copy the averages to the CPU (and identify blobs)
-        type = static_cast<AnalysisType>( type | CopySimState );
-        type = static_cast<AnalysisType>( type | IdentifyBlobs );
+        type |= AnalysisType::CopySimState;
+        type |= AnalysisType::IdentifyBlobs;
     }
     if ( timestep%ANALYSIS_INTERVAL == 5 ) {
         // Run the analysis
-        type = static_cast<AnalysisType>( type | ComputeAverages );
+        type |= AnalysisType::ComputeAverages;
     }
     if (timestep%restart_interval == 0) {
         // Write the restart file
-        type = static_cast<AnalysisType>( type | CreateRestart );
+        type |= AnalysisType::CreateRestart;
     }
     if (timestep%restart_interval == 0) {
         // Write the visualization data
-        type = static_cast<AnalysisType>( type | WriteVis );
-        type = static_cast<AnalysisType>( type | CopySimState );
-        type = static_cast<AnalysisType>( type | IdentifyBlobs );
+        type |= AnalysisType::WriteVis;
+        type |= AnalysisType::CopySimState;
+        type |= AnalysisType::IdentifyBlobs;
     }
     
     // Return if we are not doing anything
-    if ( type == AnalyzeNone )
+    if ( type == AnalysisType::AnalyzeNone )
         return;
 
     PROFILE_START("start_analysis");
@@ -284,26 +294,28 @@ void run_analysis( int timestep, int restart_interval,
     ScaLBL_DeviceBarrier();
     PROFILE_START("Copy data to host",1);
     std::shared_ptr<DoubleArray> phase;
-    if ( (type&CopyPhaseIndicator)!=0 || (type&ComputeAverages)!=0 ||
-         (type&CopySimState)!=0 || (type&IdentifyBlobs)!=0 )
+    if ( matches(type,AnalysisType::CopyPhaseIndicator) ||
+         matches(type,AnalysisType::ComputeAverages) ||
+         matches(type,AnalysisType::CopySimState) || 
+         matches(type,AnalysisType::IdentifyBlobs) )
     {
         phase = std::shared_ptr<DoubleArray>(new DoubleArray(Nx,Ny,Nz));
         ScaLBL_CopyToHost(phase->data(),Phi,N*sizeof(double));
     }
-    if ( (type&CopyPhaseIndicator)!=0 ) {
+    if ( matches(type,AnalysisType::CopyPhaseIndicator) ) {
         memcpy(Averages.Phase_tplus.data(),phase->data(),N*sizeof(double));
         //Averages.ColorToSignedDistance(beta,Averages.Phase,Averages.Phase_tplus);
     }
-    if ( (type&ComputeAverages)!=0 ) {
+    if ( matches(type,AnalysisType::ComputeAverages) ) {
         memcpy(Averages.Phase_tminus.data(),phase->data(),N*sizeof(double));
         //Averages.ColorToSignedDistance(beta,Averages.Phase,Averages.Phase_tminus);
     }
-    if ( (type&CopySimState) != 0 ) {
+    if ( matches(type,AnalysisType::CopySimState) ) {
         // Copy the members of Averages to the cpu (phase was copied above)
         // Wait 
         PROFILE_START("Copy-Pressure",1);
-		ScaLBL_D3Q19_Pressure(fq,Pressure,Np);
-    	ScaLBL_D3Q19_Momentum(fq,Velocity,Np);
+        ScaLBL_D3Q19_Pressure(fq,Pressure,Np);
+        ScaLBL_D3Q19_Momentum(fq,Velocity,Np);
         ScaLBL_DeviceBarrier();
         PROFILE_STOP("Copy-Pressure",1);
         PROFILE_START("Copy-Wait",1);
@@ -312,14 +324,14 @@ void run_analysis( int timestep, int restart_interval,
         PROFILE_STOP("Copy-Wait",1);
         PROFILE_START("Copy-State",1);
         memcpy(Averages.Phase.data(),phase->data(),N*sizeof(double));
-		ScaLBL_Comm.RegularLayout(Map,Pressure,Averages.Press);
-		ScaLBL_Comm.RegularLayout(Map,&Velocity[0],Averages.Vel_x);
-		ScaLBL_Comm.RegularLayout(Map,&Velocity[Np],Averages.Vel_y);
-		ScaLBL_Comm.RegularLayout(Map,&Velocity[2*Np],Averages.Vel_z);
+        ScaLBL_Comm.RegularLayout(Map,Pressure,Averages.Press);
+        ScaLBL_Comm.RegularLayout(Map,&Velocity[0],Averages.Vel_x);
+        ScaLBL_Comm.RegularLayout(Map,&Velocity[Np],Averages.Vel_y);
+        ScaLBL_Comm.RegularLayout(Map,&Velocity[2*Np],Averages.Vel_z);
         PROFILE_STOP("Copy-State",1);
     }
     std::shared_ptr<double> cDen, cfq;
-    if ( (type&CreateRestart) != 0 ) {
+    if ( matches(type,AnalysisType::CreateRestart) ) {
         // Copy restart data to the CPU
         cDen = std::shared_ptr<double>(new double[2*Np],DeleteArray<double>);
         cfq = std::shared_ptr<double>(new double[19*Np],DeleteArray<double>);
@@ -329,14 +341,14 @@ void run_analysis( int timestep, int restart_interval,
     PROFILE_STOP("Copy data to host",1);
 
     // Spawn threads to do blob identification work
-    if ( (type&IdentifyBlobs)!=0 ) {
+    if ( matches(type,AnalysisType::IdentifyBlobs) ) {
         BlobIDstruct new_index(new std::pair<int,IntArray>(0,IntArray()));
         BlobIDstruct new_ids(new std::pair<int,IntArray>(0,IntArray()));
         BlobIDList new_list(new std::vector<BlobIDType>());
-        ThreadPool::WorkItem *work1 = new BlobIdentificationWorkItem1(timestep,
-            Nx,Ny,Nz,rank_info,phase,Averages.SDs,last_ids,new_index,new_ids,new_list);
-        ThreadPool::WorkItem *work2 = new BlobIdentificationWorkItem2(timestep,
-            Nx,Ny,Nz,rank_info,phase,Averages.SDs,last_ids,new_index,new_ids,new_list);
+        auto work1 = new BlobIdentificationWorkItem1(timestep,Nx,Ny,Nz,rank_info,
+            phase,Averages.SDs,last_ids,new_index,new_ids,new_list);
+        auto work2 = new BlobIdentificationWorkItem2(timestep,Nx,Ny,Nz,rank_info,
+            phase,Averages.SDs,last_ids,new_index,new_ids,new_list);
         work1->add_dependency(wait.blobID);
         work2->add_dependency(tpool.add_work(work1));
         wait.blobID = tpool.add_work(work2);
@@ -346,9 +358,8 @@ void run_analysis( int timestep, int restart_interval,
     }
 
     // Spawn threads to do the analysis work
-    if ( (type&ComputeAverages) != 0 ) {
-        ThreadPool::WorkItem *work = new AnalysisWorkItem(
-            type,timestep,Averages,last_index,last_id_map,beta);
+    if ( matches(type,AnalysisType::ComputeAverages) ) {
+        auto work = new AnalysisWorkItem(type,timestep,Averages,last_index,last_id_map,beta);
         work->add_dependency(wait.blobID);
         work->add_dependency(wait.analysis);
         work->add_dependency(wait.vis);     // Make sure we are done using analysis before modifying
@@ -356,35 +367,35 @@ void run_analysis( int timestep, int restart_interval,
     }
 
     // Spawn a thread to write the restart file
-    if ( (type&CreateRestart) != 0 ) {
+    if ( matches(type,AnalysisType::CreateRestart) ) {
         int rank = MPI_WORLD_RANK();
-        //if (pBC) {
-            //err = fabs(sat_w - sat_w_previous);
-            //sat_w_previous = sat_w;
-	  //if (rank==0){
-	  // printf("Timestep %i: change in saturation since last checkpoint is %f \n",timestep,err);
-	  // }
-	  // }
+        /* if (pBC) {
+            err = fabs(sat_w - sat_w_previous);
+            sat_w_previous = sat_w;
+            if (rank==0){
+               printf("Timestep %i: change in saturation since last checkpoint is %f \n",timestep,err);
+           }
+        } */
         // Wait for previous restart files to finish writing (not necessary, but helps to ensure memory usage is limited)
         tpool.wait(wait.restart);
-	// Retain the timestep associated with the restart files
-	if (rank==0){
-	  FILE *Rst = fopen("Restart.txt","w");
-	  fprintf(Rst,"%i\n",timestep+5);
-	  fclose(Rst);
-	}
+        // Retain the timestep associated with the restart files
+        if (rank==0) {
+            FILE *Rst = fopen("Restart.txt","w");
+            fprintf(Rst,"%i\n",timestep+5);
+            fclose(Rst);
+        }
         // Write the restart file (using a seperate thread)
-        WriteRestartWorkItem *work = new WriteRestartWorkItem(LocalRestartFile,cDen,cfq,Np);
+        auto work = new WriteRestartWorkItem(LocalRestartFile,cDen,cfq,Np);
         work->add_dependency(wait.restart);
         wait.restart = tpool.add_work(work);
     }
 
     // Save the results for visualization
-    if ( (type&CreateRestart) != 0 ) {
+    if ( matches(type,AnalysisType::CreateRestart) ) {
         // Wait for previous restart files to finish writing (not necessary, but helps to ensure memory usage is limited)
         tpool.wait(wait.vis);
         // Write the vis files
-        ThreadPool::WorkItem *work = new WriteVisWorkItem( timestep, visData, Averages, fillData );
+        auto work = new WriteVisWorkItem( timestep, visData, Averages, fillData );
         work->add_dependency(wait.blobID);
         work->add_dependency(wait.analysis);
         work->add_dependency(wait.vis);
diff --git a/threadpool/atomic_helpers.cpp b/threadpool/atomic_helpers.cpp
index 1cac8e83..574cd30e 100644
--- a/threadpool/atomic_helpers.cpp
+++ b/threadpool/atomic_helpers.cpp
@@ -27,4 +27,3 @@ int atomic_pthread_lock_initialized = create_atomic_pthread_lock();
 
 } // AtomicOperations namespace
 
-
diff --git a/threadpool/atomic_helpers.h b/threadpool/atomic_helpers.h
index 5e8c4cfb..178c1af1 100644
--- a/threadpool/atomic_helpers.h
+++ b/threadpool/atomic_helpers.h
@@ -5,7 +5,6 @@
 #include <stdint.h>
 #include <stdio.h>
 #include <typeinfo>
-#include <stdexcept>
 
 // Choose the OS
 #if defined( WIN32 ) || defined( _WIN32 ) || defined( WIN64 ) || defined( _WIN64 )
@@ -89,6 +88,16 @@ inline int32_atomic atomic_get( const int32_atomic volatile *x );
  */
 inline int64_atomic atomic_get( const int64_atomic volatile *x );
 
+
+/**
+ * \brief Get the value
+ * \details Read the data in x
+ * \param[in] x     The pointer to the value to get
+ */
+template<class TYPE>
+inline TYPE *atomic_get( volatile TYPE **x );
+
+
 /**
  * \brief Set the value
  * \details Set the data in x to y (*x=y)
@@ -185,9 +194,8 @@ inline bool atomic_compare_and_swap( void *volatile *v, void *x, void *y );
  * \brief Fetch the current value and "and" with given value
  * \details Perform *v = (*v) & x, returning the previous value
  * \return Returns the previous value before the "and" operation
- * \param[in] v     The pointer to the value to check and swap
- * \param[in] x     The value to compare
- * \param[in] y     The value to swap iff *v==x
+ * \param[in] v     The pointer to the value to update
+ * \param[in] x     The value to "and" with *v
  */
 inline int32_atomic atomic_fetch_and_and( int32_atomic volatile *v, int32_atomic x );
 
@@ -195,9 +203,8 @@ inline int32_atomic atomic_fetch_and_and( int32_atomic volatile *v, int32_atomic
  * \brief Fetch the current value and "and" with given value
  * \details Perform *v = (*v) & x, returning the previous value
  * \return Returns the previous value before the "and" operation
- * \param[in] v     The pointer to the value to check and swap
- * \param[in] x     The value to compare
- * \param[in] y     The value to swap iff *v==x
+ * \param[in] v     The pointer to the value to update
+ * \param[in] x     The value to "and" with *v
  */
 inline int64_atomic atomic_fetch_and_and( int64_atomic volatile *v, int64_atomic x );
 
@@ -205,9 +212,8 @@ inline int64_atomic atomic_fetch_and_and( int64_atomic volatile *v, int64_atomic
  * \brief Fetch the current value and "or" with given value
  * \details Perform *v = (*v) | x, returning the previous value
  * \return Returns the previous value before the "and" operation
- * \param[in] v     The pointer to the value to check and swap
- * \param[in] x     The value to compare
- * \param[in] y     The value to swap iff *v==x
+ * \param[in] v     The pointer to the value to update
+ * \param[in] x     The value to "or" with *v
  */
 inline int32_atomic atomic_fetch_and_or( int32_atomic volatile *v, int32_atomic x );
 
@@ -216,52 +222,52 @@ inline int32_atomic atomic_fetch_and_or( int32_atomic volatile *v, int32_atomic
  * \details Perform *v = (*v) | x, returning the previous value
  * \return Returns the previous value before the "and" operation
  * \param[in] v     The pointer to the value to check and swap
- * \param[in] x     The value to compare
- * \param[in] y     The value to swap iff *v==x
+ *                  (updated in place with the result of the "or")
+ * \param[in] x     The value to "or" with *v
  */
 inline int64_atomic atomic_fetch_and_or( int64_atomic volatile *v, int64_atomic x );
 
 
-
 /**
  * \brief Class to store a pool of objects
  * \details This class stores a pool of objects that can be added/removed in a thread-safe way
  */
-template<class TYPE,int N_MAX>
+template<class TYPE, int N_MAX>
 class pool
 {
-  public:
-    pool( )
+public:
+    pool()
     {
-        d_data = new volatile TYPE*[N_MAX];
-        for (int i=0; i<N_MAX; i++)
+        d_data = new volatile TYPE *[N_MAX];
+        for ( int i = 0; i < N_MAX; i++ )
             d_data[i] = new TYPE;
     }
-    ~pool( )
+    ~pool()
     {
-        for (int i=0; i<N_MAX; i++)
+        for ( int i = 0; i < N_MAX; i++ )
             if ( d_data[i] != nullptr )
                 delete d_data[i];
-        delete [] d_data;
+        delete[] d_data;
     }
-    inline TYPE* get()
-    {
-        int i=0;
-        while ( true ) {
-            TYPE* tmp = const_cast<TYPE*>( d_data[i] );
-            bool swapped = atomic_compare_and_swap( (void* volatile*) &d_data[i], tmp, nullptr );
-            if ( swapped && ( tmp != nullptr ) )
-                return tmp;
-            i = (i+1)%N_MAX;
-        }
-    }
-    inline void put( TYPE* ptr )
+    inline TYPE *get()
     {
         int i = 0;
-        while ( !atomic_compare_and_swap( (void* volatile*) &d_data[i], nullptr, ptr ) )
-            i = (i+1)%N_MAX;
+        while ( true ) {
+            TYPE *tmp    = const_cast<TYPE *>( d_data[i] );
+            bool swapped = atomic_compare_and_swap( (void *volatile *) &d_data[i], tmp, nullptr );
+            if ( swapped && ( tmp != nullptr ) )
+                return tmp;
+            i = ( i + 1 ) % N_MAX;
+        }
     }
-  private:
+    inline void put( TYPE *ptr )
+    {
+        int i = 0;
+        while ( !atomic_compare_and_swap( (void *volatile *) &d_data[i], nullptr, ptr ) )
+            i = ( i + 1 ) % N_MAX;
+    }
+
+private:
     volatile TYPE **d_data;
     pool( const pool &rhs );
     pool &operator=( const pool &rhs );
@@ -323,10 +329,24 @@ inline int64_atomic atomic_decrement( int64_atomic volatile *x )
 {
     return OSAtomicDecrement64Barrier( x );
 }
-int32_atomic atomic_fetch_and_or( int32_atomic volatile *v, int32_atomic x ) { return OSAtomicOr32Orig( x, (volatile uint32_t *) v ); }
-int32_atomic atomic_fetch_and_and( int32_atomic volatile *v, int32_atomic x ) { return OSAtomicAnd32Orig( x, (volatile uint32_t *) v); }
-int64_atomic atomic_fetch_and_or( int64_atomic volatile *v, int64_atomic x ) { throw std::logic_error("Not availible for this OS"); return 0; }
-int64_atomic atomic_fetch_and_and( int64_atomic volatile *v, int64_atomic x ) { throw std::logic_error("Not availible for this OS"); return 0; }
+int32_atomic atomic_fetch_and_or( int32_atomic volatile *v, int32_atomic x )
+{
+    return OSAtomicOr32Orig( x, (volatile uint32_t *) v );
+}
+int32_atomic atomic_fetch_and_and( int32_atomic volatile *v, int32_atomic x )
+{
+    return OSAtomicAnd32Orig( x, (volatile uint32_t *) v );
+}
+int64_atomic atomic_fetch_and_or( int64_atomic volatile *v, int64_atomic x )
+{
+    throw std::logic_error( "Not availible for this OS" );
+    return 0;
+}
+int64_atomic atomic_fetch_and_and( int64_atomic volatile *v, int64_atomic x )
+{
+    throw std::logic_error( "Not availible for this OS" );
+    return 0;
+}
 inline int32_atomic atomic_add( int32_atomic volatile *x, int32_atomic y )
 {
     return OSAtomicAdd32Barrier( y, x );
@@ -352,10 +372,22 @@ int32_atomic atomic_increment( int32_atomic volatile *x ) { return __sync_add_an
 int64_atomic atomic_increment( int64_atomic volatile *x ) { return __sync_add_and_fetch( x, 1 ); }
 int32_atomic atomic_decrement( int32_atomic volatile *x ) { return __sync_sub_and_fetch( x, 1 ); }
 int64_atomic atomic_decrement( int64_atomic volatile *x ) { return __sync_sub_and_fetch( x, 1 ); }
-int32_atomic atomic_fetch_and_or( int32_atomic volatile *v, int32_atomic x ) { return __sync_fetch_and_or( v, x ); }
-int64_atomic atomic_fetch_and_or( int64_atomic volatile *v, int64_atomic x ) { return __sync_fetch_and_or( v, x ); }
-int32_atomic atomic_fetch_and_and( int32_atomic volatile *v, int32_atomic x ) { return __sync_fetch_and_and( v, x ); }
-int64_atomic atomic_fetch_and_and( int64_atomic volatile *v, int64_atomic x ) { return __sync_fetch_and_and( v, x ); }
+int32_atomic atomic_fetch_and_or( int32_atomic volatile *v, int32_atomic x )
+{
+    return __sync_fetch_and_or( v, x );
+}
+int64_atomic atomic_fetch_and_or( int64_atomic volatile *v, int64_atomic x )
+{
+    return __sync_fetch_and_or( v, x );
+}
+int32_atomic atomic_fetch_and_and( int32_atomic volatile *v, int32_atomic x )
+{
+    return __sync_fetch_and_and( v, x );
+}
+int64_atomic atomic_fetch_and_and( int64_atomic volatile *v, int64_atomic x )
+{
+    return __sync_fetch_and_and( v, x );
+}
 inline int32_atomic atomic_add( int32_atomic volatile *x, int32_atomic y )
 {
     return __sync_add_and_fetch( x, y );
@@ -459,31 +491,44 @@ inline int64_atomic atomic_get( const int64_atomic volatile *x )
 {
     return atomic_add( const_cast<int64_atomic volatile *>( x ), 0 );
 }
+template<class TYPE>
+inline TYPE *atomic_get( volatile TYPE **x )
+{
+    return reinterpret_cast<TYPE *>(
+        atomic_add( reinterpret_cast<int64_atomic volatile *>( x ), 0 ) );
+}
 inline void atomic_set( int32_atomic volatile *x, int32_atomic y )
 {
     int32_atomic tmp = *x;
-    while ( !atomic_compare_and_swap( x, tmp, y ) ) { tmp = *x; }
+    while ( !atomic_compare_and_swap( x, tmp, y ) ) {
+        tmp = *x;
+    }
 }
 inline void atomic_set( int64_atomic volatile *x, int64_atomic y )
 {
     int64_atomic tmp = *x;
-    while ( !atomic_compare_and_swap( x, tmp, y ) ) { tmp = *x; }
+    while ( !atomic_compare_and_swap( x, tmp, y ) ) {
+        tmp = *x;
+    }
 }
 inline void atomic_swap( int32_atomic volatile *x, int32_atomic *y )
 {
     int32_atomic tmp = *x;
-    while ( !atomic_compare_and_swap( x, tmp, *y ) ) { tmp = *x; }
+    while ( !atomic_compare_and_swap( x, tmp, *y ) ) {
+        tmp = *x;
+    }
     *y = tmp;
 }
 inline void atomic_swap( int64_atomic volatile *x, int64_atomic *y )
 {
     int64_atomic tmp = *x;
-    while ( !atomic_compare_and_swap( x, tmp, *y ) ) { tmp = *x; }
+    while ( !atomic_compare_and_swap( x, tmp, *y ) ) {
+        tmp = *x;
+    }
     *y = tmp;
 }
 
 
-
 // Define an atomic counter
 struct counter_t {
 public:
@@ -499,6 +544,7 @@ public:
     inline void setCount( int val ) { count = val; }
     // Get the current value of the count
     inline int getCount() const { return count; }
+
 private:
     counter_t( const counter_t & );
     counter_t &operator=( const counter_t & );
diff --git a/threadpool/atomic_list.h b/threadpool/atomic_list.h
index d3c73f2e..5da8cc85 100644
--- a/threadpool/atomic_list.h
+++ b/threadpool/atomic_list.h
@@ -1,52 +1,48 @@
 #ifndef included_AtomicModelAtomicList
 #define included_AtomicModelAtomicList
 
-#include <functional>
-#include <csignal>
 #include <atomic>
+#include <csignal>
+#include <functional>
 
 #include "threadpool/atomic_helpers.h"
 
 
-
 /** \class AtomicList
  *
- * \brief Maintain a sorted list of entries 
+ * \brief Maintain a sorted list of entries
  * \details This class implements a basic sorted list that is thread-safe and lock-free.
  *    Entries are stored smallest to largest according to the compare operator
  */
-template< class TYPE, int MAX_SIZE, class COMPARE = std::less<TYPE> >
+template<class TYPE, int MAX_SIZE, class COMPARE = std::less<TYPE>>
 class AtomicList final
 {
 public:
     //! Default constructor
-    AtomicList( const TYPE& default_value=TYPE(), const COMPARE& comp=COMPARE() );
+    AtomicList( const TYPE &default_value = TYPE(), const COMPARE &comp = COMPARE() );
 
     /*!
      * \brief   Remove an item from the list
      * \details Find and remove first entry that meets the given criteria
-     * @return          Return the item that matches the criteria, or the default item if no item matches
-     * @param comp	 	Comparison function object (i.e. an object that satisfies
+     * @return          Return the item that matches the criteria,
+     *                  or the default item if no item matches
+     * @param compare   Comparison function object (i.e. an object that satisfies
      *                  the requirements of Compare) which returns ​true if the
      *                  given value meets the selection criteria.
      *                  The signature of the comparison function should be equivalent to:
      *                      bool cmp( const TYPE& value, ... );
+     * @param args      Additional arguments for the comparison
      */
-    template<class Compare, class ... Args>
+    template<class Compare, class... Args>
     inline TYPE remove( Compare compare, Args... args );
 
     //! Remove the first from the list
-    inline TYPE remove_first( );
+    inline TYPE remove_first();
 
     /*!
      * \brief   Insert an item
      * \details Insert an item into the list
      * @param x         Item to insert
-     * @param comp	 	Comparison function object (i.e. an object that satisfies
-     *                  the requirements of Compare) which returns ​true if the
-     *                  first argument is less than (i.e. is ordered before) the second. 
-     *                  The signature of the comparison function should be equivalent to:
-     *                      bool cmp(const TYPE &a, const TYPE &b);
      */
     inline void insert( TYPE x );
 
@@ -54,19 +50,19 @@ public:
      * \brief   Return the size of the list
      * \details Return the number of items in the list
      */
-    inline int size( ) const { return AtomicOperations::atomic_get(&d_N); }
+    inline int size() const { return AtomicOperations::atomic_get( &d_N ); }
 
     /*!
      * \brief   Check if the list is empty
      * \details Return true if the list is empty
      */
-    inline bool empty( ) const { return AtomicOperations::atomic_get(&d_N)==0; }
+    inline bool empty() const { return AtomicOperations::atomic_get( &d_N ) == 0; }
 
     /*!
      * \brief   Return the capacity of the list
      * \details Return the maximum number of items the list can hold
      */
-    inline int capacity( ) const { return MAX_SIZE; }
+    inline int capacity() const { return MAX_SIZE; }
 
     /*!
      * \brief   Check the list
@@ -76,15 +72,15 @@ public:
      *    It is intended for debugging purposes only!
      * @return          This function returns true if the list is in a good working state
      */
-    inline bool check( );
+    inline bool check();
 
 
     //! Return the total number of inserts since object creation
-    inline int64_t N_insert() const { return AtomicOperations::atomic_get(&d_N_insert); }
+    inline int64_t N_insert() const { return AtomicOperations::atomic_get( &d_N_insert ); }
 
 
     //! Return the total number of removals since object creation
-    inline int64_t N_remove() const { return AtomicOperations::atomic_get(&d_N_remove); }
+    inline int64_t N_remove() const { return AtomicOperations::atomic_get( &d_N_remove ); }
 
 private:
     // Data members
@@ -92,7 +88,7 @@ private:
     volatile TYPE d_default;
     volatile TYPE d_objects[MAX_SIZE];
     volatile AtomicOperations::int32_atomic d_N;
-    volatile AtomicOperations::int32_atomic d_next[MAX_SIZE+1];
+    volatile AtomicOperations::int32_atomic d_next[MAX_SIZE + 1];
     volatile AtomicOperations::int32_atomic d_unused;
     volatile AtomicOperations::int64_atomic d_N_insert;
     volatile AtomicOperations::int64_atomic d_N_remove;
@@ -112,12 +108,12 @@ private:
         if ( i != -1 )
             AtomicOperations::atomic_fetch_and_or( &d_next[i], value );
     }
-    inline int get_unused( )
+    inline int get_unused()
     {
         int i = 0;
         while ( i == 0 )
             i = AtomicOperations::atomic_fetch_and_and( &d_unused, 0 );
-        AtomicOperations::atomic_fetch_and_or( &d_unused, -(d_next[i]+4)+1 );
+        AtomicOperations::atomic_fetch_and_or( &d_unused, -( d_next[i] + 4 ) + 1 );
         d_next[i] = -3;
         return i;
     }
@@ -126,14 +122,14 @@ private:
         int j = 0;
         while ( j == 0 )
             AtomicOperations::atomic_swap( &d_unused, &j );
-        d_next[i] = -3-j;
+        d_next[i] = -3 - j;
         AtomicOperations::atomic_fetch_and_or( &d_unused, i );
     }
 
 
 private:
-    AtomicList( const AtomicList& );
-    AtomicList& operator=( const AtomicList& );
+    AtomicList( const AtomicList & );
+    AtomicList &operator=( const AtomicList & );
 };
 
 
@@ -142,7 +138,7 @@ private:
  * \brief Pool allocator
  * \details This class implements a basic fast pool allocator that is thread-safe.
  */
-template< class TYPE, class INT_TYPE=int >
+template<class TYPE, class INT_TYPE = int>
 class MemoryPool final
 {
 public:
@@ -150,21 +146,21 @@ public:
     explicit MemoryPool( size_t size );
 
     //! destructor
-    ~MemoryPool( );
+    ~MemoryPool();
 
     /*!
      * \brief   Allocate an object
      * \details Allocates a new object from the pool
      * @return          Return the new pointer, or nullptr if there is no more room in the pool
      */
-    inline TYPE* allocate( );
+    inline TYPE *allocate();
 
     /*!
      * \brief   Insert an item
      * \details Insert an item into the list
      * @param ptr       The pointer to free
      */
-    inline void free( TYPE* ptr );
+    inline void free( TYPE *ptr );
 
 private:
     // Data members
@@ -172,13 +168,11 @@ private:
     volatile AtomicOperations::int32_atomic d_next;
 
 private:
-    MemoryPool( const MemoryPool& );
-    MemoryPool& operator=( const MemoryPool& );
+    MemoryPool( const MemoryPool & );
+    MemoryPool &operator=( const MemoryPool & );
 };
 
 
-
-
 #include "threadpool/atomic_list.hpp"
 
 #endif
diff --git a/threadpool/atomic_list.hpp b/threadpool/atomic_list.hpp
index 877d953f..a0850971 100644
--- a/threadpool/atomic_list.hpp
+++ b/threadpool/atomic_list.hpp
@@ -2,41 +2,39 @@
 #define included_AtomicList_hpp
 
 
-#include <stdexcept>
 #include <iostream>
+#include <stdexcept>
 #include <thread>
 
 
-
 /******************************************************************
-* Constructor                                                     *
-******************************************************************/
-template<class TYPE,int MAX_SIZE,class COMPARE>
-AtomicList<TYPE,MAX_SIZE,COMPARE>::AtomicList( const TYPE& default_value, const COMPARE& comp ):
-    d_compare(comp),
-    d_default(default_value)
+ * Constructor                                                     *
+ ******************************************************************/
+template<class TYPE, int MAX_SIZE, class COMPARE>
+AtomicList<TYPE, MAX_SIZE, COMPARE>::AtomicList( const TYPE &default_value, const COMPARE &comp )
+    : d_compare( comp ), d_default( default_value )
 {
-    d_N = 0;
-    d_next[0] = -1;
-    d_unused = 1;
+    d_N        = 0;
+    d_next[0]  = -1;
+    d_unused   = 1;
     d_N_insert = 0;
     d_N_remove = 0;
-    for (int i=0; i<MAX_SIZE; i++) {
-        d_next[i+1] = -5-i;
-        d_objects[i] = d_default;
+    for ( int i = 0; i < MAX_SIZE; i++ ) {
+        d_next[i + 1] = -5 - i;
+        d_objects[i]  = d_default;
     }
 }
 
 
 /******************************************************************
-* Remove an item                                                  *
-******************************************************************/
-template<class TYPE,int MAX_SIZE,class COMPARE>
-template<class Compare, class ... Args>
-inline TYPE AtomicList<TYPE,MAX_SIZE,COMPARE>::remove( Compare compare, Args... args )
+ * Remove an item                                                  *
+ ******************************************************************/
+template<class TYPE, int MAX_SIZE, class COMPARE>
+template<class Compare, class... Args>
+inline TYPE AtomicList<TYPE, MAX_SIZE, COMPARE>::remove( Compare compare, Args... args )
 {
-    // Acquiring temporary ownership 
-    int pos = 0;
+    // Acquiring temporary ownership
+    int pos   = 0;
     auto next = lock( 0 );
     while ( true ) {
         if ( next == -1 ) {
@@ -50,9 +48,10 @@ inline TYPE AtomicList<TYPE,MAX_SIZE,COMPARE>::remove( Compare compare, Args...
         // Acquire ownership of the next item
         int next2 = lock( next );
         // Test to see if the object passes compare
-        bool test = compare( const_cast<TYPE&>(d_objects[next-1]), args... );
+        bool test = compare( const_cast<TYPE &>( d_objects[next - 1] ), args... );
         if ( test ) {
-            // We want to return this object, update next to point to another entry and remove the entry
+            // We want to return this object, update next to point to another entry and remove the
+            // entry
             unlock( next, -3 );
             unlock( pos, next2 );
             pos = next;
@@ -60,28 +59,28 @@ inline TYPE AtomicList<TYPE,MAX_SIZE,COMPARE>::remove( Compare compare, Args...
         }
         // Release the ownership and move on
         unlock( pos, next );
-        pos = next;
+        pos  = next;
         next = next2;
     }
-    TYPE rtn(d_default);
+    TYPE rtn( d_default );
     if ( pos != -1 ) {
-        std::swap( rtn, const_cast<TYPE&>( d_objects[pos-1] ) );
+        std::swap( rtn, const_cast<TYPE &>( d_objects[pos - 1] ) );
         put_unused( pos );
         AtomicOperations::atomic_decrement( &d_N );
         AtomicOperations::atomic_increment( &d_N_remove );
     }
     return rtn;
 }
-template<class TYPE,int MAX_SIZE,class COMPARE>
-inline TYPE AtomicList<TYPE,MAX_SIZE,COMPARE>::remove_first( )
+template<class TYPE, int MAX_SIZE, class COMPARE>
+inline TYPE AtomicList<TYPE, MAX_SIZE, COMPARE>::remove_first()
 {
-    TYPE rtn(d_default);
+    TYPE rtn( d_default );
     auto next = lock( 0 );
     if ( next != -1 ) {
         int next2 = lock( next );
         unlock( next, -3 );
         unlock( 0, next2 );
-        std::swap( rtn, const_cast<TYPE&>( d_objects[next-1] ) );
+        std::swap( rtn, const_cast<TYPE &>( d_objects[next - 1] ) );
         put_unused( next );
         AtomicOperations::atomic_decrement( &d_N );
         AtomicOperations::atomic_increment( &d_N_remove );
@@ -93,10 +92,10 @@ inline TYPE AtomicList<TYPE,MAX_SIZE,COMPARE>::remove_first( )
 
 
 /******************************************************************
-* Insert an item                                                  *
-******************************************************************/
-template<class TYPE,int MAX_SIZE,class COMPARE>
-inline void AtomicList<TYPE,MAX_SIZE,COMPARE>::insert( TYPE x )
+ * Insert an item                                                  *
+ ******************************************************************/
+template<class TYPE, int MAX_SIZE, class COMPARE>
+inline void AtomicList<TYPE, MAX_SIZE, COMPARE>::insert( TYPE x )
 {
     int N_used = AtomicOperations::atomic_increment( &d_N );
     if ( N_used > MAX_SIZE ) {
@@ -105,14 +104,14 @@ inline void AtomicList<TYPE,MAX_SIZE,COMPARE>::insert( TYPE x )
     }
     // Get an index to store the entry
     auto index = get_unused();
-    if ( index<1 )
+    if ( index < 1 )
         throw std::logic_error( "Internal error" );
     // Store the object in d_objects
     AtomicOperations::atomic_increment( &d_N_insert );
-    d_objects[index-1] = x;
-    d_next[index] = -1;
+    d_objects[index - 1] = x;
+    d_next[index]        = -1;
     // Find the position to store and update the next entires
-    int pos = 0;
+    int pos   = 0;
     auto next = lock( pos );
     while ( true ) {
         // Get the next item in the list (acquiring temporary ownership)
@@ -122,7 +121,7 @@ inline void AtomicList<TYPE,MAX_SIZE,COMPARE>::insert( TYPE x )
             break;
         }
         // Test to see if the object is < the value being compared
-        bool test = d_compare.operator()( x, const_cast<TYPE&>(d_objects[next-1]) );
+        bool test = d_compare.operator()( x, const_cast<TYPE &>( d_objects[next - 1] ) );
         if ( test ) {
             // We want to store this object before next
             d_next[index] = next;
@@ -131,35 +130,35 @@ inline void AtomicList<TYPE,MAX_SIZE,COMPARE>::insert( TYPE x )
         }
         // Release the ownership and move on
         int last = pos;
-        pos = next;
-        next = lock( next );
+        pos      = next;
+        next     = lock( next );
         unlock( last, pos );
     }
 }
 
 
 /******************************************************************
-* Check the internal structures of the list                       *
-* This is mostly thread-safe, but blocks all threads              *
-******************************************************************/
-template<class TYPE,int MAX_SIZE,class COMPARE>
-inline bool AtomicList<TYPE,MAX_SIZE,COMPARE>::check( )
+ * Check the internal structures of the list                       *
+ * This is mostly thread-safe, but blocks all threads              *
+ ******************************************************************/
+template<class TYPE, int MAX_SIZE, class COMPARE>
+inline bool AtomicList<TYPE, MAX_SIZE, COMPARE>::check()
 {
     // Get the lock and check for any other threads modifying the list
     auto start = lock( 0 );
-    std::this_thread::sleep_for( std::chrono::microseconds(100) );
+    std::this_thread::sleep_for( std::chrono::microseconds( 100 ) );
     // Perform the checks on the list
-    bool pass = true;
-    int N1 = 0;
-    int N2 = 0;
+    bool pass    = true;
+    int N1       = 0;
+    int N2       = 0;
     int N_unused = 0;
-    int N_tail = 0;
-    for (int i=0; i<MAX_SIZE; i++) {
+    int N_tail   = 0;
+    for ( int i = 0; i < MAX_SIZE; i++ ) {
         if ( d_objects[i] != d_default )
             N1++;
     }
-    for (int i=0; i<MAX_SIZE+1; i++) {
-        int next = i==0 ? start:d_next[i];
+    for ( int i = 0; i < MAX_SIZE + 1; i++ ) {
+        int next = i == 0 ? start : d_next[i];
         if ( next > 0 ) {
             N2++;
         } else if ( next < -3 ) {
@@ -170,71 +169,70 @@ inline bool AtomicList<TYPE,MAX_SIZE,COMPARE>::check( )
             pass = false;
         }
     }
-    pass = pass && N_tail==1 && N1==d_N && N2==d_N && N_unused+d_N==MAX_SIZE;
-    int it = 0;
+    pass    = pass && N_tail == 1 && N1 == d_N && N2 == d_N && N_unused + d_N == MAX_SIZE;
+    int it  = 0;
     int pos = 0;
     while ( true ) {
-        int next = pos==0 ? start:d_next[pos];
+        int next = pos == 0 ? start : d_next[pos];
         if ( next == -1 )
             break;
         pos = next;
         it++;
     }
-    pass = pass && it==d_N;
+    pass = pass && it == d_N;
     // Unlock the list and return the results
     unlock( 0, start );
     return pass;
 }
 
 
-
 /******************************************************************
-* MemoryPool                                                      *
-******************************************************************/
-template<class TYPE,class INT_TYPE>
-MemoryPool<TYPE,INT_TYPE>::MemoryPool( size_t size )
+ * MemoryPool                                                      *
+ ******************************************************************/
+template<class TYPE, class INT_TYPE>
+MemoryPool<TYPE, INT_TYPE>::MemoryPool( size_t size )
 {
-    static_assert( sizeof(TYPE) >= sizeof(int),
+    static_assert( sizeof( TYPE ) >= sizeof( int ),
         "sizeof(TYPE) must be >= sizeof(int) to ensure proper operation" );
-    static_assert( sizeof(TYPE) >= sizeof(INT_TYPE),
+    static_assert( sizeof( TYPE ) >= sizeof( INT_TYPE ),
         "sizeof(TYPE) must be >= sizeof(INT_TYPE) to ensure proper operation" );
-    d_objects = reinterpret_cast<TYPE*>( malloc(sizeof(TYPE)*size) );
-    d_next = 1;
-    for (size_t i=0; i<size; i++)
-        reinterpret_cast<volatile INT_TYPE&>(d_objects[i]) = i+1;
-    reinterpret_cast<volatile INT_TYPE&>(d_objects[size-1]) = -1;
+    d_objects = reinterpret_cast<TYPE *>( malloc( sizeof( TYPE ) * size ) );
+    d_next    = 1;
+    for ( size_t i = 0; i < size; i++ )
+        reinterpret_cast<volatile INT_TYPE &>( d_objects[i] ) = i + 1;
+    reinterpret_cast<volatile INT_TYPE &>( d_objects[size - 1] ) = -1;
 }
-template<class TYPE,class INT_TYPE>
-MemoryPool<TYPE,INT_TYPE>::~MemoryPool()
+template<class TYPE, class INT_TYPE>
+MemoryPool<TYPE, INT_TYPE>::~MemoryPool()
 {
-    free( const_cast<TYPE*>( d_objects ) );
+    free( const_cast<TYPE *>( d_objects ) );
     d_objects = nullptr;
 }
-template<class TYPE,class INT_TYPE>
-inline TYPE* MemoryPool<TYPE,INT_TYPE>::allocate()
+template<class TYPE, class INT_TYPE>
+inline TYPE *MemoryPool<TYPE, INT_TYPE>::allocate()
 {
     AtomicOperations::int32_atomic i = 0;
     while ( i == 0 )
         AtomicOperations::atomic_swap( &d_next, &i );
     TYPE *ptr = nullptr;
-    if ( i!=-1 ) {
-        INT_TYPE j = reinterpret_cast<volatile INT_TYPE&>(d_objects[i-1]);
-        ptr = const_cast<TYPE*>( &d_objects[i-1] );
-        new(ptr) TYPE();
-        i = j+1;
+    if ( i != -1 ) {
+        INT_TYPE j = reinterpret_cast<volatile INT_TYPE &>( d_objects[i - 1] );
+        ptr        = const_cast<TYPE *>( &d_objects[i - 1] );
+        new ( ptr ) TYPE();
+        i = j + 1;
     }
     AtomicOperations::atomic_fetch_and_or( &d_next, i );
     return ptr;
 }
-template<class TYPE,class INT_TYPE>
-inline void MemoryPool<TYPE,INT_TYPE>::free( TYPE* ptr )
+template<class TYPE, class INT_TYPE>
+inline void MemoryPool<TYPE, INT_TYPE>::free( TYPE *ptr )
 {
     ptr->~TYPE();
     AtomicOperations::int32_atomic i = 0;
     while ( i == 0 )
         AtomicOperations::atomic_swap( &d_next, &i );
-    reinterpret_cast<INT_TYPE&>(*ptr) = i-1;
-    i = ptr - d_objects + 1;
+    reinterpret_cast<INT_TYPE &>( *ptr ) = i - 1;
+    i                                    = ptr - d_objects + 1;
     AtomicOperations::atomic_fetch_and_or( &d_next, i );
 }
 
diff --git a/threadpool/test/test_atomic.cpp b/threadpool/test/test_atomic.cpp
index c3e0c5b0..27c76ee1 100644
--- a/threadpool/test/test_atomic.cpp
+++ b/threadpool/test/test_atomic.cpp
@@ -1,15 +1,15 @@
 #include "threadpool/atomic_helpers.h"
 #include "common/UnitTest.h"
 #include "common/Utilities.h"
-#include <iostream>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string>
-#include <vector>
-#include <thread>
-#include <chrono>
-#include <functional>
 #include <atomic>
+#include <chrono>
+#include <cstdio>
+#include <cstdlib>
+#include <functional>
+#include <iostream>
+#include <string>
+#include <thread>
+#include <vector>
 
 
 #define perr std::cerr
@@ -21,18 +21,18 @@
 static void modify_counter( int N, AtomicOperations::counter_t &counter )
 {
     if ( N > 0 ) {
-        for (int i=0; i<N; i++)
+        for ( int i = 0; i < N; i++ )
             counter.increment();
     } else if ( N < 0 ) {
-        for (int i=0; i<-N; i++)
+        for ( int i = 0; i < -N; i++ )
             counter.decrement();
     }
 }
 
 
 /******************************************************************
-* The main program                                                *
-******************************************************************/
+ * The main program                                                *
+ ******************************************************************/
 #ifdef USE_WINDOWS
 int __cdecl main( int, char ** )
 {
@@ -60,25 +60,25 @@ int main( int, char *[] )
     // Create the counter we want to test
     AtomicOperations::counter_t count;
     if ( count.increment() == 1 )
-        ut.passes("increment count");
+        ut.passes( "increment count" );
     else
-        ut.failure("increment count");
+        ut.failure( "increment count" );
     if ( count.decrement() == 0 )
-        ut.passes("decrement count");
+        ut.passes( "decrement count" );
     else
-        ut.failure("decrement count");
-    count.setCount(3);
+        ut.failure( "decrement count" );
+    count.setCount( 3 );
     if ( count.getCount() == 3 )
-        ut.passes("set count");
+        ut.passes( "set count" );
     else
-        ut.failure("set count");
-    count.setCount(0);
+        ut.failure( "set count" );
+    count.setCount( 0 );
 
     // Increment the counter in serial
     auto start = std::chrono::high_resolution_clock::now();
     modify_counter( N_count, count );
-    auto stop = std::chrono::high_resolution_clock::now();
-    double time_inc_serial = std::chrono::duration<double>(stop-start).count() / N_count;
+    auto stop              = std::chrono::high_resolution_clock::now();
+    double time_inc_serial = std::chrono::duration<double>( stop - start ).count() / N_count;
     int val                = count.getCount();
     if ( val != N_count ) {
         char tmp[100];
@@ -90,8 +90,8 @@ int main( int, char *[] )
     // Decrement the counter in serial
     start = std::chrono::high_resolution_clock::now();
     modify_counter( -N_count, count );
-    stop = std::chrono::high_resolution_clock::now();
-    double time_dec_serial = std::chrono::duration<double>(stop-start).count() / N_count;
+    stop                   = std::chrono::high_resolution_clock::now();
+    double time_dec_serial = std::chrono::duration<double>( stop - start ).count() / N_count;
     val                    = count.getCount();
     if ( val != 0 ) {
         char tmp[100];
@@ -104,12 +104,13 @@ int main( int, char *[] )
     std::vector<std::thread> threads( N_threads );
     start = std::chrono::high_resolution_clock::now();
     for ( int i = 0; i < N_threads; i++ )
-        threads[i] = std::thread( modify_counter, N_count, std::ref(count) );
+        threads[i] = std::thread( modify_counter, N_count, std::ref( count ) );
     for ( int i = 0; i < N_threads; i++ )
         threads[i].join();
     stop = std::chrono::high_resolution_clock::now();
-    double time_inc_parallel = std::chrono::duration<double>(stop-start).count() / ( N_count * N_threads );
-    val                      = count.getCount();
+    double time_inc_parallel =
+        std::chrono::duration<double>( stop - start ).count() / ( N_count * N_threads );
+    val = count.getCount();
     if ( val != N_count * N_threads ) {
         char tmp[100];
         sprintf( tmp, "Count of %i did not match expected count of %i", val, N_count * N_threads );
@@ -120,12 +121,13 @@ int main( int, char *[] )
     // Decrement the counter in parallel
     start = std::chrono::high_resolution_clock::now();
     for ( int i = 0; i < N_threads; i++ )
-        threads[i] = std::thread( modify_counter, -N_count, std::ref(count) );
+        threads[i] = std::thread( modify_counter, -N_count, std::ref( count ) );
     for ( int i = 0; i < N_threads; i++ )
         threads[i].join();
     stop = std::chrono::high_resolution_clock::now();
-    double time_dec_parallel = std::chrono::duration<double>(stop-start).count() / ( N_count * N_threads );
-    val                      = count.getCount();
+    double time_dec_parallel =
+        std::chrono::duration<double>( stop - start ).count() / ( N_count * N_threads );
+    val = count.getCount();
     if ( val != 0 ) {
         char tmp[100];
         sprintf( tmp, "Count of %i did not match expected count of %i", val, 0 );
@@ -147,6 +149,6 @@ int main( int, char *[] )
 
     // Finished
     ut.report();
-    int N_errors = static_cast<int>( ut.NumFailGlobal() );
+    auto N_errors = static_cast<int>( ut.NumFailGlobal() );
     return N_errors;
 }
diff --git a/threadpool/test/test_atomic_list.cpp b/threadpool/test/test_atomic_list.cpp
index 7d4aee16..4717dcc3 100644
--- a/threadpool/test/test_atomic_list.cpp
+++ b/threadpool/test/test_atomic_list.cpp
@@ -1,210 +1,221 @@
 #include "threadpool/atomic_list.h"
 #include "common/UnitTest.h"
 #include "common/Utilities.h"
-#include <iostream>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string>
-#include <vector>
-#include <thread>
-#include <chrono>
-#include <functional>
-#include <atomic>
 #include <algorithm>
+#include <atomic>
+#include <chrono>
+#include <cstdio>
+#include <cstdlib>
+#include <functional>
+#include <iostream>
+#include <string>
+#include <thread>
+#include <vector>
 
 
 
-static void modify_list( AtomicList<int,1024>& list )
+static void modify_list( AtomicList<int, 1024> &list )
 {
     const int N_count = 50000;
-    for (int i=0; i<N_count; i++) {
-        auto v1 = list.remove_first( );
-        auto v2 = list.remove( [](int) { return true; } );
-        auto v3 = list.remove( [](int v) { return v>=(rand()/8); } );
-        auto v4 = list.remove( [](int v) { return v>=(rand()/4); } );
-        auto v5 = list.remove( [](int v) { return v>=(rand()/2); } );
-        if ( v1 !=-1 ) { list.insert( v1 ); }
-        if ( v2 !=-1 ) { list.insert( v2 ); }
-        if ( v3 !=-1 ) { list.insert( v3 ); }
-        if ( v4 !=-1 ) { list.insert( v4 ); }
-        if ( v5 !=-1 ) { list.insert( v5 ); }
+    for ( int i = 0; i < N_count; i++ ) {
+        auto v1 = list.remove_first();
+        auto v2 = list.remove( []( int ) { return true; } );
+        auto v3 = list.remove( []( int v ) { return v >= ( rand() / 8 ); } );
+        auto v4 = list.remove( []( int v ) { return v >= ( rand() / 4 ); } );
+        auto v5 = list.remove( []( int v ) { return v >= ( rand() / 2 ); } );
+        if ( v1 != -1 ) {
+            list.insert( v1 );
+        }
+        if ( v2 != -1 ) {
+            list.insert( v2 );
+        }
+        if ( v3 != -1 ) {
+            list.insert( v3 );
+        }
+        if ( v4 != -1 ) {
+            list.insert( v4 );
+        }
+        if ( v5 != -1 ) {
+            list.insert( v5 );
+        }
     }
 }
 
 
-static bool check_list( const std::vector<int>& x, AtomicList<int,1024>& list )
+static bool check_list( const std::vector<int> &x, AtomicList<int, 1024> &list )
 {
     bool pass = list.check();
-    pass = pass && (int) x.size() == list.size();
+    pass      = pass && (int) x.size() == list.size();
     if ( pass ) {
-        for (size_t i=0; i<x.size(); i++)
-            pass = pass && x[i] == list.remove( [](int) { return true; } );
+        for ( int i : x )
+            pass = pass && i == list.remove( []( int ) { return true; } );
     }
     // Restore the list
-    for (int i=0; i<list.size(); i++)
+    while ( !list.empty() ) // drain fully; i < size() would stop halfway as size() shrinks
         list.remove_first();
-    for (size_t i=0; i<x.size(); i++)
-        list.insert( x[i] );
+    for ( int i : x )
+        list.insert( i );
     return pass;
 }
 
 
-static inline void clear_list(AtomicList<int,1024>& list )
+static inline void clear_list( AtomicList<int, 1024> &list )
 {
-    for (int i=0; i<list.size(); i++)
+    while ( !list.empty() ) // an index loop against the shrinking size() stops halfway
         list.remove_first();
 }
 
 
-
 /******************************************************************
-* The main program                                                *
-******************************************************************/
+ * The main program                                                *
+ ******************************************************************/
 int main( int, char *[] )
 {
     UnitTest ut;
 
-    int N_threads = 8;      // Number of threads
+    int N_threads = 8; // Number of threads
 
     // Create the list
-    AtomicList<int,1024> list(-1);
-    if ( list.size()==0 && list.check() )
+    AtomicList<int, 1024> list( -1 );
+    if ( list.size() == 0 && list.check() )
         ut.passes( "Initialize" );
     else
         ut.failure( "Initialize" );
 
     // Initialize the list with some empty values
-    for (int i=0; i<80; i++)
+    for ( int i = 0; i < 80; i++ )
         list.insert( rand() );
     list.insert( 2 );
     list.insert( 1 );
     list.insert( rand() );
 
     // Try to pull off a couple of values
-    int v1 = list.remove( [](int a) { return a==1; } );    // Find the entry with 1
-    int v2 = list.remove( [](int) { return true; } );      // Get the first entry
-    int v3 = list.remove( [](int) { return false; } );     // Fail to get an entry
-    if ( v1==1 && v2==2 && v3==-1 && list.size()==81 && list.check() )
+    int v1 = list.remove( []( int a ) { return a == 1; } ); // Find the entry with 1
+    int v2 = list.remove( []( int ) { return true; } );     // Get the first entry
+    int v3 = list.remove( []( int ) { return false; } );    // Fail to get an entry
+    if ( v1 == 1 && v2 == 2 && v3 == -1 && list.size() == 81 && list.check() )
         ut.passes( "Basic sanity test" );
     else
         ut.failure( "Basic sanity test" );
 
     // Clear the list
-    while ( list.remove( [](int) { return true; } ) != -1 ) {}
+    while ( list.remove( []( int ) { return true; } ) != -1 ) {
+    }
 
     // Create a list of known values
-    //std::vector<int> data0(512);
-    std::vector<int> data0(5*N_threads);
-    for (size_t i=0; i<data0.size(); i++)
-        data0[i] = rand();
+    // std::vector<int> data0(512);
+    std::vector<int> data0( 5 * N_threads );
+    for ( int &i : data0 )
+        i = rand();
     auto data = data0;
     std::sort( data.begin(), data.end() );
 
     // Test the cost to insert
     int N_it = 20;
-    for (int i=0; i<list.size(); i++)
-        list.remove( [](int) { return true; } );
+    for ( int i = 0; i < list.size(); i++ )
+        list.remove( []( int ) { return true; } );
     std::chrono::duration<double> time;
     std::chrono::time_point<std::chrono::high_resolution_clock> start, stop;
     time = time.zero();
-    for (int it=0; it<N_it; it++ ) {
+    for ( int it = 0; it < N_it; it++ ) {
         clear_list( list );
         start = std::chrono::high_resolution_clock::now();
-        for (size_t i=0; i<data0.size(); i++)
-            list.insert( data0[i] );
+        for ( int i : data0 )
+            list.insert( i );
         stop = std::chrono::high_resolution_clock::now();
         time += ( stop - start );
     }
-    printf("insert time/item = %0.0f ns\n",1e9*time.count()/(N_it*data0.size()));
+    printf( "insert time/item = %0.0f ns\n", 1e9 * time.count() / ( N_it * data0.size() ) );
 
     // Test the cost to remove (first)
     time = time.zero();
-    for (int it=0; it<N_it; it++ ) {
+    for ( int it = 0; it < N_it; it++ ) {
         check_list( data, list );
         start = std::chrono::high_resolution_clock::now();
-        for (size_t i=0; i<data0.size(); i++)
-            list.remove_first( );
+        for ( size_t i = 0; i < data0.size(); i++ )
+            list.remove_first();
         stop = std::chrono::high_resolution_clock::now();
         time += ( stop - start );
     }
-    printf("remove (first) time/item = %0.0f ns\n",1e9*time.count()/(N_it*data0.size()));
+    printf( "remove (first) time/item = %0.0f ns\n", 1e9 * time.count() / ( N_it * data0.size() ) );
 
     // Test the cost to remove (in order)
     time = time.zero();
-    for (int it=0; it<N_it; it++ ) {
+    for ( int it = 0; it < N_it; it++ ) {
         check_list( data, list );
         start = std::chrono::high_resolution_clock::now();
-        for (size_t i=0; i<data0.size(); i++)
-            list.remove( [](int) { return true; } );
+        for ( size_t i = 0; i < data0.size(); i++ )
+            list.remove( []( int ) { return true; } );
         stop = std::chrono::high_resolution_clock::now();
         time += ( stop - start );
     }
-    printf("remove (ordered) time/item = %0.0f ns\n",1e9*time.count()/(N_it*data0.size()));
+    printf(
+        "remove (ordered) time/item = %0.0f ns\n", 1e9 * time.count() / ( N_it * data0.size() ) );
 
     // Test the cost to remove (out order)
     time = time.zero();
-    for (int it=0; it<N_it; it++ ) {
+    for ( int it = 0; it < N_it; it++ ) {
         check_list( data, list );
         start = std::chrono::high_resolution_clock::now();
-        for (size_t i=0; i<data0.size(); i++) {
-            int tmp = data0[i];
-            list.remove( [tmp](int v) { return v==tmp; } );
+        for ( int tmp : data0 ) {
+            list.remove( [tmp]( int v ) { return v == tmp; } );
         }
         stop = std::chrono::high_resolution_clock::now();
         time += ( stop - start );
     }
-    printf("remove (unordered) time/item = %0.0f ns\n",1e9*time.count()/(N_it*data0.size()));
+    printf(
+        "remove (unordered) time/item = %0.0f ns\n", 1e9 * time.count() / ( N_it * data0.size() ) );
 
     // Read/write to the list and check the results
     int64_t N0 = list.N_remove();
     check_list( data, list );
     start = std::chrono::high_resolution_clock::now();
     modify_list( list );
-    stop = std::chrono::high_resolution_clock::now();
-    double time_serial = std::chrono::duration<double>(stop-start).count();
-    int64_t N1 = list.N_remove();
-    bool pass = check_list( data, list );
+    stop               = std::chrono::high_resolution_clock::now();
+    double time_serial = std::chrono::duration<double>( stop - start ).count();
+    int64_t N1         = list.N_remove();
+    bool pass          = check_list( data, list );
     if ( pass )
         ut.passes( "Serial get/insert" );
     else
         ut.failure( "Serial get/insert" );
-    printf("serial time = %0.5f s\n",time_serial);
-    printf("serial time/item = %0.0f ns\n",1e9*time_serial/(N1-N0));
+    printf( "serial time = %0.5f s\n", time_serial );
+    printf( "serial time/item = %0.0f ns\n", 1e9 * time_serial / ( N1 - N0 ) );
 
     // Have multiple threads reading/writing to the list simultaneously
     std::vector<std::thread> threads( N_threads );
     start = std::chrono::high_resolution_clock::now();
     for ( int i = 0; i < N_threads; i++ )
-        threads[i] = std::thread( modify_list, std::ref(list) );
+        threads[i] = std::thread( modify_list, std::ref( list ) );
     for ( int i = 0; i < N_threads; i++ )
         threads[i].join();
-    stop = std::chrono::high_resolution_clock::now();
-    double time_parallel = std::chrono::duration<double>(stop-start).count();
-    int64_t N2 = list.N_remove();
-    pass = check_list( data, list );
+    stop                 = std::chrono::high_resolution_clock::now();
+    double time_parallel = std::chrono::duration<double>( stop - start ).count();
+    int64_t N2           = list.N_remove();
+    pass                 = check_list( data, list );
     if ( pass )
         ut.passes( "Parallel get/insert" );
     else
         ut.failure( "Parallel get/insert" );
-    printf("parallel time = %0.5f s\n",time_parallel);
-    printf("parallel time/item = %0.0f ns\n",1e9*time_parallel/(N2-N1));
+    printf( "parallel time = %0.5f s\n", time_parallel );
+    printf( "parallel time/item = %0.0f ns\n", 1e9 * time_parallel / ( N2 - N1 ) );
 
     // Try to over-fill the list
     while ( !list.empty() )
         list.remove_first();
-    for (int i=1; i<=list.capacity(); i++)
+    for ( int i = 1; i <= list.capacity(); i++ )
         list.insert( i );
     try {
-        list.insert( list.capacity()+1 );
+        list.insert( list.capacity() + 1 );
         ut.failure( "List overflow" );
-    } catch (const std::exception& e) {
+    } catch ( const std::exception &e ) {
         ut.passes( "List overflow" );
-    } catch(...) {
+    } catch ( ... ) {
         ut.failure( "List overflow (unknown exception)" );
     }
 
     // Finished
     ut.report();
-    int N_errors = static_cast<int>( ut.NumFailGlobal() );
+    auto N_errors = static_cast<int>( ut.NumFailGlobal() );
     return N_errors;
 }
diff --git a/threadpool/test/test_thread_pool.cpp b/threadpool/test/test_thread_pool.cpp
index 1fd0ae63..b7168f4b 100644
--- a/threadpool/test/test_thread_pool.cpp
+++ b/threadpool/test/test_thread_pool.cpp
@@ -5,15 +5,15 @@
 #include "threadpool/thread_pool.h"
 #include "common/UnitTest.h"
 #include "common/Utilities.h"
-#include <math.h>
 #include <algorithm>
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
 #include <iostream>
+#include <mutex>
 #include <stdexcept>
-#include <stdio.h>
-#include <stdlib.h>
 #include <string>
 #include <vector>
-#include <mutex>
 
 
 #define MAX( x, y ) ( ( x ) > ( y ) ? ( x ) : ( y ) )
@@ -28,8 +28,8 @@
 #include "mpi.h"
 #endif
 
-#define to_ns(x) std::chrono::duration_cast<std::chrono::nanoseconds>(x).count()
-#define to_ms(x) std::chrono::duration_cast<std::chrono::milliseconds>(x).count()
+#define to_ns( x ) std::chrono::duration_cast<std::chrono::nanoseconds>( x ).count()
+#define to_ms( x ) std::chrono::duration_cast<std::chrono::milliseconds>( x ).count()
 
 
 // Wrapper functions for mpi
@@ -82,18 +82,17 @@ void waste_cpu( int N )
 // Sleep for the given time
 // Note: since we may encounter interrupts, we may not sleep for the desired time
 //   so we need to perform the sleep in a loop
-void sleep_ms( int64_t N ) {
+void sleep_ms( int64_t N )
+{
     auto t1 = std::chrono::high_resolution_clock::now();
     auto t2 = std::chrono::high_resolution_clock::now();
-    while ( to_ms(t2-t1) < N ) {
-        int N2 = N - to_ms(t2-t1);
-        std::this_thread::sleep_for( std::chrono::milliseconds(N2) );
+    while ( to_ms( t2 - t1 ) < N ) {
+        int64_t N2 = N - to_ms( t2 - t1 ); // keep 64 bits: N is int64_t, int would truncate
+        std::this_thread::sleep_for( std::chrono::milliseconds( N2 ) );
         t2 = std::chrono::high_resolution_clock::now();
     }
 }
-void sleep_s( int N ) {
-    sleep_ms(1000*N);
-}
+void sleep_s( int N ) { sleep_ms( 1000 * N ); }
 
 
 // Function to sleep for N seconds then increment a global count
@@ -133,9 +132,9 @@ void print_processor( ThreadPool *tpool )
     int processor = ThreadPool::getCurrentProcessor();
     char tmp[100];
     sprintf( tmp, "%i:  Thread,proc = %i,%i\n", rank, thread, processor );
-    sleep_ms( 10*rank );
+    sleep_ms( 10 * rank );
     print_processor_mutex.lock();
-    std::cout << tmp;
+    pout << tmp;
     print_processor_mutex.unlock();
     sleep_ms( 100 );
 }
@@ -161,7 +160,9 @@ int test_member_thread( ThreadPool *tpool )
 }
 
 
-// Functions to test the templates
+/******************************************************************
+ * Test the TPOOL_ADD_WORK macro with variable number of arguments *
+ ******************************************************************/
 static int myfun0() { return 0; }
 static int myfun1( int ) { return 1; }
 static int myfun2( int, float ) { return 2; }
@@ -170,60 +171,6 @@ static int myfun4( int, float, double, char ) { return 4; }
 static int myfun5( int, float, double, char, std::string ) { return 5; }
 static int myfun6( int, float, double, char, std::string, int ) { return 6; }
 static int myfun7( int, float, double, char, std::string, int, int ) { return 7; }
-
-
-// Function to test instantiation of functions with different number of arguments
-// clang-format off
-static void vfunarg00() {}
-static void vfunarg01( int ) {}
-static void vfunarg02( int, char ) {}
-static void vfunarg03( int, char, double ) {}
-static void vfunarg04( int, char, double, int ) {}
-static void vfunarg05( int, char, double, int, char ) {}
-static void vfunarg06( int, char, double, int, char, double ) {}
-static void vfunarg07( int, char, double, int, char, double, int ) {}
-static void vfunarg08( int, char, double, int, char, double, int, char ) {}
-static void vfunarg09( int, char, double, int, char, double, int, char, double ) {}
-static void vfunarg10( int, char, double, int, char, double, int, char, double, int ) {}
-static void vfunarg11( int, char, double, int, char, double, int, char, double, int, char ) {}
-static void vfunarg12( int, char, double, int, char, double, int, char, double, int, char, double ) {}
-static void vfunarg13( int, char, double, int, char, double, int, char, double, int, char, double, int ) {}
-static void vfunarg14( int, char, double, int, char, double, int, char, double, int, char, double, int, char ) {}
-static void vfunarg15( int, char, double, int, char, double, int, char, double, int, char, double, int, char, double ) {}
-static void vfunarg16( int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int ) {}
-static void vfunarg17( int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int, char ) {}
-static void vfunarg18( int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int, char, double ) {}
-static void vfunarg19( int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int ) {}
-static void vfunarg20( int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int, char ) {}
-static void vfunarg21( int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int, char, double ) {}
-static void vfunarg22( int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int ) {}
-static void vfunarg23( int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int, char ) {}
-static void vfunarg24( int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int, char, double ) {}
-static int funarg00() { return 0; }
-static int funarg01( int ) { return 1; }
-static int funarg02( int, char ) { return 2; }
-static int funarg03( int, char, double ) { return 3; }
-static int funarg04( int, char, double, int ) { return 4; }
-static int funarg05( int, char, double, int, char ) { return 5; }
-static int funarg06( int, char, double, int, char, double ) { return 6; }
-static int funarg07( int, char, double, int, char, double, int ) { return 7; }
-static int funarg08( int, char, double, int, char, double, int, char ) { return 8; }
-static int funarg09( int, char, double, int, char, double, int, char, double ) { return 9; }
-static int funarg10( int, char, double, int, char, double, int, char, double, int ) { return 10; }
-static int funarg11( int, char, double, int, char, double, int, char, double, int, char ) { return 11; }
-static int funarg12( int, char, double, int, char, double, int, char, double, int, char, double ) { return 12; }
-static int funarg13( int, char, double, int, char, double, int, char, double, int, char, double, int ) { return 13; }
-static int funarg14( int, char, double, int, char, double, int, char, double, int, char, double, int, char ) { return 14; }
-static int funarg15( int, char, double, int, char, double, int, char, double, int, char, double, int, char, double ) { return 15; }
-static int funarg16( int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int ) { return 16; }
-static int funarg17( int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int, char ) { return 17; }
-static int funarg18( int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int, char, double ) { return 18; }
-static int funarg19( int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int ) { return 19; }
-static int funarg20( int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int, char ) { return 20; }
-static int funarg21( int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int, char, double ) { return 21; }
-static int funarg22( int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int ) { return 22; }
-static int funarg23( int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int, char ) { return 23; }
-static int funarg24( int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int, char, double, int, char, double ) { return 24; }
 static int test_function_arguements( ThreadPool *tpool )
 {
     int N_errors = 0;
@@ -231,88 +178,56 @@ static int test_function_arguements( ThreadPool *tpool )
     ThreadPool::thread_id_t id0 = TPOOL_ADD_WORK( tpool, myfun0, ( nullptr ) );
     ThreadPool::thread_id_t id1 = TPOOL_ADD_WORK( tpool, myfun1, ( (int) 1 ) );
     ThreadPool::thread_id_t id2 = TPOOL_ADD_WORK( tpool, myfun2, ( (int) 1, (float) 2 ) );
-    ThreadPool::thread_id_t id3 = TPOOL_ADD_WORK( tpool, myfun3, ( (int) 1, (float) 2, (double) 3 ) );
-    ThreadPool::thread_id_t id4 = TPOOL_ADD_WORK( tpool, myfun4, ( (int) 1, (float) 2, (double) 3, (char) 4 ) );
-    ThreadPool::thread_id_t id5 = TPOOL_ADD_WORK( tpool, myfun5, ( (int) 1, (float) 2, (double) 3, (char) 4, std::string( "test" ) ) );
-    ThreadPool::thread_id_t id52= TPOOL_ADD_WORK( tpool, myfun5, ( (int) 1, (float) 2, (double) 3, (char) 4, std::string( "test" ) ), -1 );
-    ThreadPool::thread_id_t id6 = TPOOL_ADD_WORK( tpool, myfun6, ( (int) 1, (float) 2, (double) 3, (char) 4, std::string( "test" ), (int) 1 ) );
-    ThreadPool::thread_id_t id7 = TPOOL_ADD_WORK( tpool, myfun7, ( (int) 1, (float) 2, (double) 3, (char) 4, std::string( "test" ), (int) 1, (int) 1 ) );
+    ThreadPool::thread_id_t id3 =
+        TPOOL_ADD_WORK( tpool, myfun3, ( (int) 1, (float) 2, (double) 3 ) );
+    ThreadPool::thread_id_t id4 =
+        TPOOL_ADD_WORK( tpool, myfun4, ( (int) 1, (float) 2, (double) 3, (char) 4 ) );
+    ThreadPool::thread_id_t id5 = TPOOL_ADD_WORK(
+        tpool, myfun5, ( (int) 1, (float) 2, (double) 3, (char) 4, std::string( "test" ) ) );
+    ThreadPool::thread_id_t id52 = TPOOL_ADD_WORK(
+        tpool, myfun5, ( (int) 1, (float) 2, (double) 3, (char) 4, std::string( "test" ) ), -1 );
+    ThreadPool::thread_id_t id6 = TPOOL_ADD_WORK( tpool, myfun6,
+        ( (int) 1, (float) 2, (double) 3, (char) 4, std::string( "test" ), (int) 1 ) );
+    ThreadPool::thread_id_t id7 = TPOOL_ADD_WORK( tpool, myfun7,
+        ( (int) 1, (float) 2, (double) 3, (char) 4, std::string( "test" ), (int) 1, (int) 1 ) );
     tpool->wait_pool_finished();
-    if ( !tpool->isFinished( id0 ) ) { N_errors++; }
-    if ( tpool->getFunctionRet<int>( id0 ) != 0 ) { N_errors++; }
-    if ( tpool->getFunctionRet<int>( id1 ) != 1 ) { N_errors++; }
-    if ( tpool->getFunctionRet<int>( id2 ) != 2 ) { N_errors++; }
-    if ( tpool->getFunctionRet<int>( id3 ) != 3 ) { N_errors++; }
-    if ( tpool->getFunctionRet<int>( id4 ) != 4 ) { N_errors++; }
-    if ( tpool->getFunctionRet<int>( id5 ) != 5 ) { N_errors++; }
-    if ( tpool->getFunctionRet<int>( id52 ) != 5 ){ N_errors++; }
-    if ( tpool->getFunctionRet<int>( id6 ) != 6 ) { N_errors++; }
-    if ( tpool->getFunctionRet<int>( id7 ) != 7 ) { N_errors++; }
-    // Test all the different numbers of arguments allowed
-    TPOOL_ADD_WORK( tpool, vfunarg00, ( nullptr ) );
-    TPOOL_ADD_WORK( tpool, vfunarg01, ( 1 ) );
-    TPOOL_ADD_WORK( tpool, vfunarg02, ( 1, 'a' ) );
-    TPOOL_ADD_WORK( tpool, vfunarg03, ( 1, 'a', 3.0 ) );
-    TPOOL_ADD_WORK( tpool, vfunarg04, ( 1, 'a', 3.0, 4 ) );
-    TPOOL_ADD_WORK( tpool, vfunarg05, ( 1, 'a', 3.0, 4, 'e' ) );
-    TPOOL_ADD_WORK( tpool, vfunarg06, ( 1, 'a', 3.0, 4, 'e', 6.0 ) );
-    TPOOL_ADD_WORK( tpool, vfunarg07, ( 1, 'a', 3.0, 4, 'e', 6.0, 7 ) );
-    TPOOL_ADD_WORK( tpool, vfunarg08, ( 1, 'a', 3.0, 4, 'e', 6.0, 7, 'h' ) );
-    TPOOL_ADD_WORK( tpool, vfunarg09, ( 1, 'a', 3.0, 4, 'e', 6.0, 7, 'h', 9.0 ) );
-    TPOOL_ADD_WORK( tpool, vfunarg10, ( 1, 'a', 3.0, 4, 'e', 6.0, 7, 'h', 9.0, 10 ) );
-    TPOOL_ADD_WORK( tpool, vfunarg11, ( 1, 'a', 3.0, 4, 'e', 6.0, 7, 'h', 9.0, 10, 'k' ) );
-    TPOOL_ADD_WORK( tpool, vfunarg12, ( 1, 'a', 3.0, 4, 'e', 6.0, 7, 'h', 9.0, 10, 'k', 12.0 ) );
-    TPOOL_ADD_WORK( tpool, vfunarg13, ( 1, 'a', 3.0, 4, 'e', 6.0, 7, 'h', 9.0, 10, 'k', 12.0, 13 ) );
-    TPOOL_ADD_WORK( tpool, vfunarg14, ( 1, 'a', 3.0, 4, 'e', 6.0, 7, 'h', 9.0, 10, 'k', 12.0, 13, 'n' ) );
-    TPOOL_ADD_WORK( tpool, vfunarg15, ( 1, 'a', 3.0, 4, 'e', 6.0, 7, 'h', 9.0, 10, 'k', 12.0, 13, 'n', 15.0 ) );
-    TPOOL_ADD_WORK( tpool, vfunarg16, ( 1, 'a', 3.0, 4, 'e', 6.0, 7, 'h', 9.0, 10, 'k', 12.0, 13, 'n', 15.0, 16 ) );
-    TPOOL_ADD_WORK( tpool, vfunarg17, ( 1, 'a', 3.0, 4, 'e', 6.0, 7, 'h', 9.0, 10, 'k', 12.0, 13, 'n', 15.0, 16, 'q' ) );
-    TPOOL_ADD_WORK( tpool, vfunarg18, ( 1, 'a', 3.0, 4, 'e', 6.0, 7, 'h', 9.0, 10, 'k', 12.0, 13, 'n', 15.0, 16, 'q', 18.0 ) );
-    TPOOL_ADD_WORK( tpool, vfunarg19, ( 1, 'a', 3.0, 4, 'e', 6.0, 7, 'h', 9.0, 10, 'k', 12.0, 13, 'n', 15.0, 16, 'q', 18.0, 19 ) );
-    TPOOL_ADD_WORK( tpool, vfunarg20, ( 1, 'a', 3.0, 4, 'e', 6.0, 7, 'h', 9.0, 10, 'k', 12.0, 13, 'n', 15.0, 16, 'q', 18.0, 19, 't' ) );
-    TPOOL_ADD_WORK( tpool, vfunarg21, ( 1, 'a', 3.0, 4, 'e', 6.0, 7, 'h', 9.0, 10, 'k', 12.0, 13, 'n', 15.0, 16, 'q', 18.0, 19, 't', 21.0 ) );
-    TPOOL_ADD_WORK( tpool, vfunarg22, ( 1, 'a', 3.0, 4, 'e', 6.0, 7, 'h', 9.0, 10, 'k', 12.0, 13, 'n', 15.0, 16, 'q', 18.0, 19, 't', 21.0, 22 ) );
-    TPOOL_ADD_WORK( tpool, vfunarg23, ( 1, 'a', 3.0, 4, 'e', 6.0, 7, 'h', 9.0, 10, 'k', 12.0, 13, 'n', 15.0, 16, 'q', 18.0, 19, 't', 21.0, 22, 'w' ) );
-    TPOOL_ADD_WORK( tpool, vfunarg24, ( 1, 'a', 3.0, 4, 'e', 6.0, 7, 'h', 9.0, 10, 'k', 12.0, 13, 'n', 15.0, 16, 'q', 18.0, 19, 't', 21.0, 22, 'w', 24.0 ) );
-    std::vector<ThreadPool::thread_id_t> ids( 25 );
-    ids[0]  = TPOOL_ADD_WORK( tpool, funarg00, ( nullptr ) );
-    ids[1]  = TPOOL_ADD_WORK( tpool, funarg01, ( 1 ) );
-    ids[2]  = TPOOL_ADD_WORK( tpool, funarg02, ( 1, 'a' ) );
-    ids[3]  = TPOOL_ADD_WORK( tpool, funarg03, ( 1, 'a', 3.0 ) );
-    ids[4]  = TPOOL_ADD_WORK( tpool, funarg04, ( 1, 'a', 3.0, 4 ) );
-    ids[5]  = TPOOL_ADD_WORK( tpool, funarg05, ( 1, 'a', 3.0, 4, 'e' ) );
-    ids[6]  = TPOOL_ADD_WORK( tpool, funarg06, ( 1, 'a', 3.0, 4, 'e', 6.0 ) );
-    ids[7]  = TPOOL_ADD_WORK( tpool, funarg07, ( 1, 'a', 3.0, 4, 'e', 6.0, 7 ) );
-    ids[8]  = TPOOL_ADD_WORK( tpool, funarg08, ( 1, 'a', 3.0, 4, 'e', 6.0, 7, 'h' ) );
-    ids[9]  = TPOOL_ADD_WORK( tpool, funarg09, ( 1, 'a', 3.0, 4, 'e', 6.0, 7, 'h', 9.0 ) );
-    ids[10] = TPOOL_ADD_WORK( tpool, funarg10, ( 1, 'a', 3.0, 4, 'e', 6.0, 7, 'h', 9.0, 10 ) );
-    ids[11] = TPOOL_ADD_WORK( tpool, funarg11, ( 1, 'a', 3.0, 4, 'e', 6.0, 7, 'h', 9.0, 10, 'k' ) );
-    ids[12] = TPOOL_ADD_WORK( tpool, funarg12, ( 1, 'a', 3.0, 4, 'e', 6.0, 7, 'h', 9.0, 10, 'k', 12.0 ) );
-    ids[13] = TPOOL_ADD_WORK( tpool, funarg13, ( 1, 'a', 3.0, 4, 'e', 6.0, 7, 'h', 9.0, 10, 'k', 12.0, 13 ) );
-    ids[14] = TPOOL_ADD_WORK( tpool, funarg14, ( 1, 'a', 3.0, 4, 'e', 6.0, 7, 'h', 9.0, 10, 'k', 12.0, 13, 'h' ) );
-    ids[15] = TPOOL_ADD_WORK( tpool, funarg15, ( 1, 'a', 3.0, 4, 'e', 6.0, 7, 'h', 9.0, 10, 'k', 12.0, 13, 'h', 15.0 ) );
-    ids[16] = TPOOL_ADD_WORK( tpool, funarg16, ( 1, 'a', 3.0, 4, 'e', 6.0, 7, 'h', 9.0, 10, 'k', 12.0, 13, 'n', 15.0, 16 ) );
-    ids[17] = TPOOL_ADD_WORK( tpool, funarg17, ( 1, 'a', 3.0, 4, 'e', 6.0, 7, 'h', 9.0, 10, 'k', 12.0, 13, 'n', 15.0, 16, 'q' ) );
-    ids[18] = TPOOL_ADD_WORK( tpool, funarg18, ( 1, 'a', 3.0, 4, 'e', 6.0, 7, 'h', 9.0, 10, 'k', 12.0, 13, 'n', 15.0, 16, 'q', 18.0 ) );
-    ids[19] = TPOOL_ADD_WORK( tpool, funarg19, ( 1, 'a', 3.0, 4, 'e', 6.0, 7, 'h', 9.0, 10, 'k', 12.0, 13, 'n', 15.0, 16, 'q', 18.0, 19 ) );
-    ids[20] = TPOOL_ADD_WORK( tpool, funarg20, ( 1, 'a', 3.0, 4, 'e', 6.0, 7, 'h', 9.0, 10, 'k', 12.0, 13, 'n', 15.0, 16, 'q', 18.0, 19, 't' ) );
-    ids[21] = TPOOL_ADD_WORK( tpool, funarg21, ( 1, 'a', 3.0, 4, 'e', 6.0, 7, 'h', 9.0, 10, 'k', 12.0, 13, 'n', 15.0, 16, 'q', 18.0, 19, 't', 21.0 ) );
-    ids[22] = TPOOL_ADD_WORK( tpool, funarg22, ( 1, 'a', 3.0, 4, 'e', 6.0, 7, 'h', 9.0, 10, 'k', 12.0, 13, 'n', 15.0, 16, 'q', 18.0, 19, 't', 21.0, 22 ) );
-    ids[23] = TPOOL_ADD_WORK( tpool, funarg23, ( 1, 'a', 3.0, 4, 'e', 6.0, 7, 'h', 9.0, 10, 'k', 12.0, 13, 'n', 15.0, 16, 'q', 18.0, 19, 't', 21.0, 22, 'w' ) );
-    ids[24] = TPOOL_ADD_WORK( tpool, funarg24, ( 1, 'a', 3.0, 4, 'e', 6.0, 7, 'h', 9.0, 10, 'k', 12.0, 13, 'n', 15.0, 16, 'q', 18.0, 19, 't', 21.0, 22, 'w', 24.0 ) );
-    tpool->wait_all( ids );
-    for ( size_t i = 0; i < ids.size(); i++ ) {
-        if ( tpool->getFunctionRet<int>( ids[i] ) != static_cast<int>( i ) )
-            N_errors++;
+    if ( !tpool->isFinished( id0 ) ) {
+        N_errors++;
+    }
+    if ( tpool->getFunctionRet<int>( id0 ) != 0 ) {
+        N_errors++;
+    }
+    if ( tpool->getFunctionRet<int>( id1 ) != 1 ) {
+        N_errors++;
+    }
+    if ( tpool->getFunctionRet<int>( id2 ) != 2 ) {
+        N_errors++;
+    }
+    if ( tpool->getFunctionRet<int>( id3 ) != 3 ) {
+        N_errors++;
+    }
+    if ( tpool->getFunctionRet<int>( id4 ) != 4 ) {
+        N_errors++;
+    }
+    if ( tpool->getFunctionRet<int>( id5 ) != 5 ) {
+        N_errors++;
+    }
+    if ( tpool->getFunctionRet<int>( id52 ) != 5 ) {
+        N_errors++;
+    }
+    if ( tpool->getFunctionRet<int>( id6 ) != 6 ) {
+        N_errors++;
+    }
+    if ( tpool->getFunctionRet<int>( id7 ) != 7 ) {
+        N_errors++;
     }
     return N_errors;
 }
-// clang-format on
 
 
 /******************************************************************
-* Examples to derive a user work item                             *
-******************************************************************/
+ * Examples to derive a user work item                             *
+ ******************************************************************/
 class UserWorkItemVoid : public ThreadPool::WorkItem
 {
 public:
@@ -323,15 +238,15 @@ public:
         NULL_USE( dummy );
     }
     // User defined run (can do anything)
-    virtual void run() override
+    void run() override
     {
         // Perform the tasks
         printf( "Hello work from UserWorkItem (void)" );
     }
     // Will the routine return a result
-    virtual bool has_result() const override { return false; }
+    bool has_result() const override { return false; }
     // User defined destructor
-    virtual ~UserWorkItemVoid() {}
+    ~UserWorkItemVoid() override = default;
 };
 class UserWorkItemInt : public ThreadPool::WorkItemRet<int>
 {
@@ -343,38 +258,31 @@ public:
         NULL_USE( dummy );
     }
     // User defined run (can do anything)
-    virtual void run() override
+    void run() override
     {
         // Perform the tasks
         printf( "Hello work from UserWorkItem (int)" );
         // Store the results (it's type will match the template)
         ThreadPool::WorkItemRet<int>::d_result = 1;
     }
-    // Will the routine return a result
-    virtual bool has_result() const override { return false; }
     // User defined destructor
-    virtual ~UserWorkItemInt() {}
+    ~UserWorkItemInt() override = default;
 };
 
 
 /******************************************************************
-* test the time to run N tasks in parallel                        *
-******************************************************************/
-inline double run_parallel( ThreadPool *tpool, int N_tasks, int N_work )
+ * test the time to run N tasks in parallel                        *
+ ******************************************************************/
+template<class Ret, class... Args>
+inline double launchAndTime( ThreadPool &tpool, int N, Ret ( *routine )( Args... ), Args... args )
 {
-    // Make sure the thread pool is empty
-    tpool->wait_pool_finished();
-    // Add the work
-    std::vector<ThreadPool::thread_id_t> ids;
-    ids.reserve( N_tasks );
+    tpool.wait_pool_finished();
     auto start = std::chrono::high_resolution_clock::now();
-    for ( int i = 0; i < N_tasks; i++ )
-        ids.push_back( TPOOL_ADD_WORK( tpool, waste_cpu, ( N_work ) ) );
-    // Wait for the thread pool to finish
-    tpool->wait_pool_finished();
-    // Compute the time spent running the tasks
+    for ( int i = 0; i < N; i++ )
+        ThreadPool_add_work( &tpool, 0, routine, args... );
+    tpool.wait_pool_finished();
     auto stop = std::chrono::high_resolution_clock::now();
-    return std::chrono::duration<double>(stop-start).count();
+    return std::chrono::duration<double>( stop - start ).count();
 }
 
 
@@ -384,8 +292,8 @@ ThreadPool::thread_id_t f2( ThreadPool::thread_id_t a ) { return a; }
 
 
 /******************************************************************
-* Test the basic functionallity of the atomics                    *
-******************************************************************/
+ * Test the basic functionallity of the atomics                    *
+ ******************************************************************/
 int test_atomics()
 {
     using namespace AtomicOperations;
@@ -411,33 +319,35 @@ int test_atomics()
 
 
 /******************************************************************
-* Test FIFO behavior                                              *
-******************************************************************/
-void test_FIFO( UnitTest& ut, ThreadPool& tpool )
+ * Test FIFO behavior                                              *
+ ******************************************************************/
+void test_FIFO( UnitTest &ut, ThreadPool &tpool )
 {
-    int rank = getRank();
-    int size = getSize();
-    for (int r=0; r<size; r++) {
+    int rank    = getRank();
+    int size    = getSize();
+    const int N = 4000;
+    for ( int r = 0; r < size; r++ ) {
         barrier();
         if ( r != rank )
-            continue;   
+            continue;
         std::vector<ThreadPool::thread_id_t> ids;
-        for (size_t i=0; i<4000; i++)
-            ids.push_back( TPOOL_ADD_WORK( &tpool, sleep_inc2, ( 0.001 ) ) );
+        ids.reserve( N );
+        for ( size_t i = 0; i < N; i++ )
+            ids.emplace_back( TPOOL_ADD_WORK( &tpool, sleep_inc2, ( 0.001 ) ) );
         bool pass = true;
         while ( tpool.N_queued() > 0 ) {
-            int i1=-1, i2=ids.size();
-            for (size_t i=0; i<ids.size(); i++) {
+            int i1 = -1, i2 = ids.size();
+            for ( int i = N - 1; i >= 0; i-- ) {
                 bool started = ids[i].started();
                 if ( started )
-                    i1 = std::max<int>(i1,i);   // Last index to processing item
+                    i1 = std::max<int>( i1, i ); // Last index to processing item
                 else
-                    i2 = std::min<int>(i2,i);   // First index to queued item
+                    i2 = std::min<int>( i2, i ); // First index to queued item
             }
-            int diff = i1==-1 ? 0:(i2-i1-1);
-            if ( abs(diff)>4 ) {
-                printf("%i %i %i\n",i1,i2,diff);
-                pass = pass && abs(i2-i1-1)<=2;
+            int diff = i1 == -1 ? 0 : ( i2 - i1 - 1 );
+            if ( abs( diff ) > 4 ) {
+                printf( "%i %i %i\n", i1, i2, diff );
+                pass = pass && abs( i2 - i1 - 1 ) <= 2;
             }
         }
         ids.clear();
@@ -451,8 +361,8 @@ void test_FIFO( UnitTest& ut, ThreadPool& tpool )
 
 
 /******************************************************************
-* The main program                                                *
-******************************************************************/
+ * The main program                                                *
+ ******************************************************************/
 #ifdef USE_WINDOWS
 int __cdecl main( int argc, char **argv )
 {
@@ -510,11 +420,7 @@ int main( int argc, char *argv[] )
 
     // Get the number of processors availible
     barrier();
-    int N_procs = 0;
-    try {
-        N_procs = ThreadPool::getNumberOfProcessors();
-    } catch ( ... ) {
-    }
+    int N_procs = ThreadPool::getNumberOfProcessors();
     if ( N_procs > 0 )
         ut.passes( "getNumberOfProcessors" );
     else
@@ -524,15 +430,11 @@ int main( int argc, char *argv[] )
 
     // Get the processor affinities for the process
     barrier();
-    std::vector<int> cpus;
-    try {
-        cpus = ThreadPool::getProcessAffinity();
-        printp( "%i cpus for current process: ", (int) cpus.size() );
-        for ( size_t i = 0; i < cpus.size(); i++ )
-            printp( "%i ", cpus[i] );
-        printp( "\n" );
-    } catch ( ... ) {
-    }
+    std::vector<int> cpus = ThreadPool::getProcessAffinity();
+    printp( "%i cpus for current process: ", (int) cpus.size() );
+    for ( int cpu : cpus )
+        printp( "%i ", cpu );
+    printp( "\n" );
     if ( !cpus.empty() ) {
         ut.passes( "getProcessAffinity" );
     } else {
@@ -559,8 +461,8 @@ int main( int argc, char *argv[] )
             cpus                  = ThreadPool::getProcessAffinity();
             std::vector<int> cpus = ThreadPool::getProcessAffinity();
             printp( "%i cpus for current process (updated): ", (int) cpus.size() );
-            for ( size_t i = 0; i < cpus.size(); i++ )
-                printp( "%i ", cpus[i] );
+            for ( int cpu : cpus )
+                printp( "%i ", cpu );
             printp( "\n" );
             pass = cpus.size() > 1;
         } else {
@@ -630,8 +532,8 @@ int main( int argc, char *argv[] )
             std::vector<int> procs_thread = tpool.getThreadAffinity( i );
             if ( procs_thread != procs ) {
                 printp( "%i: Initial thread affinity: ", rank );
-                for ( size_t i = 0; i < procs_thread.size(); i++ )
-                    printp( "%i ", procs_thread[i] );
+                for ( int cpu : procs_thread )
+                    printp( "%i ", cpu );
                 printp( "\n" );
                 pass = false;
             }
@@ -646,15 +548,15 @@ int main( int argc, char *argv[] )
             int N_procs_thread = std::max<int>( (int) cpus.size() / N_threads, 1 );
             for ( int i = 0; i < N_threads; i++ ) {
                 std::vector<int> procs_thread( N_procs_thread, -1 );
-                for ( int j         = 0; j < N_procs_thread; j++ )
+                for ( int j = 0; j < N_procs_thread; j++ )
                     procs_thread[j] = procs[( i * N_procs_thread + j ) % procs.size()];
                 tpool.setThreadAffinity( i, procs_thread );
                 sleep_ms( 10 ); // Give time for OS to update thread affinities
                 std::vector<int> procs_thread2 = tpool.getThreadAffinity( i );
                 if ( procs_thread2 != procs_thread ) {
                     printp( "%i: Final thread affinity: ", rank );
-                    for ( size_t i = 0; i < procs_thread.size(); i++ )
-                        printp( "%i ", procs_thread[i] );
+                    for ( int cpu : procs_thread )
+                        printp( "%i ", cpu );
                     printp( "\n" );
                     pass = false;
                 }
@@ -674,8 +576,8 @@ int main( int argc, char *argv[] )
     for ( int i = 0; i < N_threads; i++ ) {
         std::vector<int> procs_thread = tpool.getThreadAffinity( i );
         printp( "Thread affinity: " );
-        for ( size_t i = 0; i < procs_thread.size(); i++ )
-            printp( "%i ", procs_thread[i] );
+        for ( int cpu : procs_thread )
+            printp( "%i ", cpu );
         printp( "\n" );
     }
 
@@ -683,9 +585,7 @@ int main( int argc, char *argv[] )
     barrier();
     ThreadPool::set_OS_warnings( 1 );
     print_processor( &tpool );
-    for ( int i = 0; i < N_threads; i++ )
-        TPOOL_ADD_WORK( &tpool, print_processor, ( &tpool ) );
-    tpool.wait_pool_finished();
+    launchAndTime( tpool, N_threads, print_processor, &tpool );
 
     // Run some basic tests
     barrier();
@@ -694,8 +594,8 @@ int main( int argc, char *argv[] )
         for ( int i = 0; i < N_work; i++ )
             waste_cpu( data1[i] );
     }
-    auto stop = std::chrono::high_resolution_clock::now();
-    double time = std::chrono::duration<double>(stop-start).count();
+    auto stop   = std::chrono::high_resolution_clock::now();
+    double time = std::chrono::duration<double>( stop - start ).count();
     printp( "Time for serial cycle = %0.0f us\n", 1e6 * time / N_it );
     printp( "Time for serial item = %0.0f ns\n", 1e9 * time / ( N_it * N_work ) );
     id = TPOOL_ADD_WORK( &tpool, waste_cpu, ( data1[0] ) );
@@ -728,20 +628,14 @@ int main( int argc, char *argv[] )
     tpool.wait_pool_finished();
     start = std::chrono::high_resolution_clock::now();
     sleep_inc( 1 );
-    stop = std::chrono::high_resolution_clock::now();
-    double sleep_serial = std::chrono::duration<double>(stop-start).count();
-    ids2.clear();
-    start = std::chrono::high_resolution_clock::now();
-    for ( int i = 0; i < N_threads; i++ )
-        ids2.push_back( TPOOL_ADD_WORK( &tpool, sleep_inc, ( 1 ) ) );
-    tpool.wait_all( N_procs_used, &ids2[0] );
-    stop = std::chrono::high_resolution_clock::now();
-    ids2.clear();
-    double sleep_parallel = std::chrono::duration<double>(stop-start).count();
+    stop                  = std::chrono::high_resolution_clock::now();
+    double sleep_serial   = std::chrono::duration<double>( stop - start ).count();
+    double sleep_parallel = launchAndTime( tpool, N_threads, sleep_inc, 1 );
     double sleep_speedup  = N_procs_used * sleep_serial / sleep_parallel;
     printf( "%i:  Speedup on %i sleeping threads: %0.3f\n", rank, N_procs_used, sleep_speedup );
     printf( "%i:    ts = %0.3f, tp = %0.3f\n", rank, sleep_serial, sleep_parallel );
-    if ( fabs( sleep_serial - 1.0 ) < 0.05 && fabs( sleep_parallel - 1.0 ) < 0.25 && sleep_speedup>3 )
+    if ( fabs( sleep_serial - 1.0 ) < 0.05 && fabs( sleep_parallel - 1.0 ) < 0.25 &&
+         sleep_speedup > 3 )
         ut.passes( "Passed thread sleep" );
     else
         ut.failure( "Failed thread sleep" );
@@ -770,11 +664,11 @@ int main( int argc, char *argv[] )
         // Run in serial
         start = std::chrono::high_resolution_clock::now();
         waste_cpu( N );
-        stop = std::chrono::high_resolution_clock::now();
-        double time_serial = std::chrono::duration<double>(stop-start).count();
+        stop               = std::chrono::high_resolution_clock::now();
+        double time_serial = std::chrono::duration<double>( stop - start ).count();
         // Run in parallel
-        double time_parallel2 = run_parallel( &tpool, N_procs_used, N / 1000 );
-        double time_parallel  = run_parallel( &tpool, N_procs_used, N );
+        double time_parallel  = launchAndTime( tpool, N_procs_used, waste_cpu, N );
+        double time_parallel2 = launchAndTime( tpool, N_procs_used, waste_cpu, N / 1000 );
         double speedup        = N_procs_used * time_serial / time_parallel;
         printf( "%i:  Speedup on %i procs: %0.3f\n", rank, N_procs_used, speedup );
         printf( "%i:    ts = %0.3f, tp = %0.3f, tp2 = %0.3f\n", rank, time_serial, time_parallel,
@@ -823,8 +717,8 @@ int main( int argc, char *argv[] )
         ids.reserve( 5 );
         global_sleep_count = 0; // Reset the count before this test
         ThreadPool::thread_id_t id0;
-        auto id1 = TPOOL_ADD_WORK( &tpool, sleep_inc, ( 1 ) );
-        auto id2 = TPOOL_ADD_WORK( &tpool, sleep_inc, ( 2 ) );
+        auto id1    = TPOOL_ADD_WORK( &tpool, sleep_inc, ( 1 ) );
+        auto id2    = TPOOL_ADD_WORK( &tpool, sleep_inc, ( 2 ) );
         auto *wait1 = new WorkItemFull<bool, int>( check_inc, 1 );
         auto *wait2 = new WorkItemFull<bool, int>( check_inc, 2 );
         wait1->add_dependency( id0 );
@@ -842,15 +736,15 @@ int main( int argc, char *argv[] )
         tpool.wait_pool_finished();
         // Test waiting on more dependencies than in the thread pool (changing priorities)
         ids.clear();
-        for (size_t i=0; i<20; i++)
+        for ( size_t i = 0; i < 20; i++ )
             ids.push_back( TPOOL_ADD_WORK( &tpool, sleep_inc2, ( 0.1 ) ) );
-        auto *wait3 = new WorkItemFull<void,double>( sleep_inc2, 0 );
+        auto *wait3 = new WorkItemFull<void, double>( sleep_inc2, 0 );
         wait3->add_dependencies( ids );
         id = tpool.add_work( wait3, 50 );
         tpool.wait( id );
         bool pass = true;
-        for (size_t i=0; i<ids.size(); i++)
-            pass = pass && ids[i].finished();
+        for ( auto &dep : ids )
+            pass = pass && dep.finished();
         ids.clear();
         if ( pass )
             ut.passes( "Dependencies2" );
@@ -896,21 +790,21 @@ int main( int argc, char *argv[] )
             for ( int i = 0; i < N_work; i++ )
                 delete work[i];
             auto t4 = std::chrono::high_resolution_clock::now();
-            time_create += to_ns(t2-t1);
-            time_run    += to_ns(t3-t2);
-            time_delete += to_ns(t4-t3);
+            time_create += to_ns( t2 - t1 );
+            time_run += to_ns( t3 - t2 );
+            time_delete += to_ns( t4 - t3 );
             if ( ( n + 1 ) % 100 == 0 )
                 printp( "Cycle %i of %i finished\n", n + 1, N_it );
         }
         stop = std::chrono::high_resolution_clock::now();
-        time = std::chrono::duration<double>(stop-start).count();
+        time = std::chrono::duration<double>( stop - start ).count();
         PROFILE_STOP( timer_name );
         printp( "   time = %0.0f ms\n", 1e3 * time );
         printp( "   time / cycle = %0.0f us\n", 1e6 * time / N_it );
         printp( "   average time / item = %0.0f ns\n", 1e9 * time / ( N_it * N_work ) );
-        printp( "      create = %i ns\n", static_cast<int>( time_create / ( N_it * N_work ) ) );
-        printp( "      run    = %i ns\n", static_cast<int>( time_run    / ( N_it * N_work ) ) );
-        printp( "      delete = %i us\n", static_cast<int>( time_delete / ( N_it * N_work ) ) );
+        printp( "      create = %i ns\n", static_cast<int>( time_create / ( N_it * N_work ) ) );
+        printp( "      run    = %i ns\n", static_cast<int>( time_run / ( N_it * N_work ) ) );
+        printp( "      delete = %i ns\n", static_cast<int>( time_delete / ( N_it * N_work ) ) );
     }
 
     // Test the timing adding a single item
@@ -921,17 +815,17 @@ int main( int argc, char *argv[] )
         if ( it == 0 ) {
             printp( "Testing timmings (adding a single item to empty tpool):\n" );
             timer_name = "Add single item to empty pool";
-            tpool_ptr = &tpool0;
+            tpool_ptr  = &tpool0;
         } else if ( it == 1 ) {
             printp( "Testing timmings (adding a single item):\n" );
             timer_name = "Add single item to tpool";
-            tpool_ptr = &tpool;
+            tpool_ptr  = &tpool;
         }
         PROFILE_START( timer_name );
         std::vector<ThreadPool::thread_id_t> ids( N_work );
         int64_t time_add  = 0;
         int64_t time_wait = 0;
-        start = std::chrono::high_resolution_clock::now();
+        start             = std::chrono::high_resolution_clock::now();
         for ( int n = 0; n < N_it; n++ ) {
             auto t1 = std::chrono::high_resolution_clock::now();
             for ( int i = 0; i < N_work; i++ )
@@ -939,19 +833,19 @@ int main( int argc, char *argv[] )
             auto t2 = std::chrono::high_resolution_clock::now();
             tpool_ptr->wait_all( N_work, &ids[0] );
             auto t3 = std::chrono::high_resolution_clock::now();
-            time_add += to_ns(t2-t1);
-            time_wait += to_ns(t3-t2);
+            time_add += to_ns( t2 - t1 );
+            time_wait += to_ns( t3 - t2 );
             if ( ( n + 1 ) % 100 == 0 )
                 printp( "Cycle %i of %i finished\n", n + 1, N_it );
         }
         stop = std::chrono::high_resolution_clock::now();
-        time = std::chrono::duration<double>(stop-start).count();
+        time = std::chrono::duration<double>( stop - start ).count();
         PROFILE_STOP( timer_name );
         printp( "   time = %0.0f ms\n", 1e3 * time );
         printp( "   time / cycle = %0.0f us\n", 1e6 * time / N_it );
         printp( "   average time / item = %0.0f ns\n", 1e9 * time / ( N_it * N_work ) );
-        printp( "      create and add = %i ns\n", static_cast<int>( time_add / ( N_it * N_work ) ) );
-        printp( "      wait = %i us\n", static_cast<int>( time_wait / ( N_it * N_work ) ) );
+        printp( "      create and add = %i ns\n", static_cast<int>( time_add / ( N_it * N_work ) ) );
+        printp( "      wait = %i ns\n", static_cast<int>( time_wait / ( N_it * N_work ) ) );
     }
 
     // Test the timing pre-creating the work items and adding multiple at a time
@@ -962,11 +856,11 @@ int main( int argc, char *argv[] )
         if ( it == 0 ) {
             printp( "Testing timmings (adding a block of items to empty tpool):\n" );
             timer_name = "Add multiple items to empty pool";
-            tpool_ptr = &tpool0;
+            tpool_ptr  = &tpool0;
         } else if ( it == 1 ) {
             printp( "Testing timmings (adding a block of items):\n" );
             timer_name = "Add multiple items to tpool";
-            tpool_ptr = &tpool;
+            tpool_ptr  = &tpool;
         }
         PROFILE_START( timer_name );
         int64_t time_create_work = 0;
@@ -978,26 +872,26 @@ int main( int argc, char *argv[] )
             auto t1 = std::chrono::high_resolution_clock::now();
             for ( int i = 0; i < N_work; i++ )
                 work[i] = ThreadPool::createWork<void, int>( waste_cpu, data1[i] );
-            auto t2 = std::chrono::high_resolution_clock::now();
+            auto t2  = std::chrono::high_resolution_clock::now();
             auto ids = tpool_ptr->add_work( work, priority );
-            auto t3 = std::chrono::high_resolution_clock::now();
+            auto t3  = std::chrono::high_resolution_clock::now();
             tpool_ptr->wait_all( ids );
             auto t4 = std::chrono::high_resolution_clock::now();
-            time_create_work += to_ns(t2-t1);
-            time_add_work += to_ns(t3-t2);
-            time_wait_work += to_ns(t4-t3);
+            time_create_work += to_ns( t2 - t1 );
+            time_add_work += to_ns( t3 - t2 );
+            time_wait_work += to_ns( t4 - t3 );
             if ( ( n + 1 ) % 100 == 0 )
                 printp( "Cycle %i of %i finished\n", n + 1, N_it );
         }
         stop = std::chrono::high_resolution_clock::now();
-        time = std::chrono::duration<double>(stop-start).count();
+        time = std::chrono::duration<double>( stop - start ).count();
         PROFILE_STOP( timer_name );
         printp( "   time = %0.0f ms\n", 1e3 * time );
         printp( "   time / cycle = %0.0f us\n", 1e6 * time / N_it );
         printp( "   average time / item = %0.0f ns\n", 1e9 * time / ( N_it * N_work ) );
-        printp( "      create = %i ns\n", static_cast<int>( time_create_work / ( N_it * N_work ) ) );
-        printp( "      add = %i ns\n",  static_cast<int>( time_add_work / ( N_it * N_work ) ) );
-        printp( "      wait = %i ns\n", static_cast<int>( time_wait_work / ( N_it * N_work ) ) );
+        printp( "      create = %i ns\n", static_cast<int>( time_create_work / ( N_it * N_work ) ) );
+        printp( "      add = %i ns\n", static_cast<int>( time_add_work / ( N_it * N_work ) ) );
+        printp( "      wait = %i ns\n", static_cast<int>( time_wait_work / ( N_it * N_work ) ) );
     }
 
     // Run a dependency test that tests a simple case that should keep the thread pool busy
@@ -1035,8 +929,8 @@ int main( int argc, char *argv[] )
     barrier();
     pass = true;
     try {
-        ThreadPool *tpool = new ThreadPool( MAX_NUM_THREADS - 1 );
-        if ( tpool->getNumThreads() != MAX_NUM_THREADS - 1 )
+        ThreadPool *tpool = new ThreadPool( ThreadPool::MAX_NUM_THREADS - 1 );
+        if ( tpool->getNumThreads() != ThreadPool::MAX_NUM_THREADS - 1 )
             pass = false;
         if ( !ThreadPool::is_valid( tpool ) )
             pass = false;
@@ -1056,14 +950,14 @@ int main( int argc, char *argv[] )
     // Print the test results
     barrier();
     ut.report();
-    int N_errors = static_cast<int>( ut.NumFailGlobal() );
+    auto N_errors = static_cast<int>( ut.NumFailGlobal() );
 
     // Shudown MPI
     pout << "Shutting down\n";
     barrier();
 #ifdef USE_TIMER
     if ( rank == 0 )
-        MemoryApp::print( std::cout );
+        MemoryApp::print( pout );
 #endif
 #ifdef USE_MPI
     MPI_Finalize();
diff --git a/threadpool/thread_pool.cpp b/threadpool/thread_pool.cpp
index 30281727..9b0ff4fd 100644
--- a/threadpool/thread_pool.cpp
+++ b/threadpool/thread_pool.cpp
@@ -5,14 +5,14 @@
 #include "ProfilerApp.h"
 #include <algorithm>
 #include <bitset>
+#include <chrono>
 #include <climits>
+#include <cstdio>
+#include <cstdlib>
 #include <iostream>
 #include <stdexcept>
-#include <stdio.h>
-#include <stdlib.h>
-#include <typeinfo>
 #include <thread>
-#include <chrono>
+#include <typeinfo>
 
 
 #define perr std::cerr
@@ -22,6 +22,15 @@
 
 // OS specific includes / definitions
 // clang-format off
+#if defined( WIN32 ) || defined( _WIN32 ) || defined( WIN64 ) || defined( _WIN64 )
+    #define USE_WINDOWS
+#elif defined( __APPLE__ )
+    #define USE_MAC
+#elif defined( __linux ) || defined( __unix ) || defined( __posix )
+    #define USE_LINUX
+#else
+    #error Unknown OS
+#endif
 #if defined( USE_WINDOWS )
     #include <process.h>
     #include <windows.h>
@@ -54,41 +63,45 @@
 
 // Set some macros
 #if PROFILE_THREADPOOL_PERFORMANCE
-    #define PROFILE_THREADPOOL_START( X )  PROFILE_START( X, 3 )
-    #define PROFILE_THREADPOOL_START2( X ) PROFILE_START2( X, 3 )
-    #define PROFILE_THREADPOOL_STOP( X )   PROFILE_STOP( X, 3 )
-    #define PROFILE_THREADPOOL_STOP2( X )  PROFILE_STOP2( X, 3 )
+#define PROFILE_THREADPOOL_START( X ) PROFILE_START( X, 3 )
+#define PROFILE_THREADPOOL_START2( X ) PROFILE_START2( X, 3 )
+#define PROFILE_THREADPOOL_STOP( X ) PROFILE_STOP( X, 3 )
+#define PROFILE_THREADPOOL_STOP2( X ) PROFILE_STOP2( X, 3 )
 #else
-    #define PROFILE_THREADPOOL_START( X ) \
-        do {                          \
-        } while ( 0 )
-    #define PROFILE_THREADPOOL_START2( X ) \
-        do {                           \
-        } while ( 0 )
-    #define PROFILE_THREADPOOL_STOP( X ) \
-        do {                         \
-        } while ( 0 )
-    #define PROFILE_THREADPOOL_STOP2( X ) \
-        do {                          \
-        } while ( 0 )
+#define PROFILE_THREADPOOL_START( X ) \
+    do {                              \
+    } while ( 0 )
+#define PROFILE_THREADPOOL_START2( X ) \
+    do {                               \
+    } while ( 0 )
+#define PROFILE_THREADPOOL_STOP( X ) \
+    do {                             \
+    } while ( 0 )
+#define PROFILE_THREADPOOL_STOP2( X ) \
+    do {                              \
+    } while ( 0 )
 #endif
 #if MONITOR_THREADPOOL_PERFORMANCE == 1
-    #define accumulate( x, t1, t2 ) AtomicOperations::atomic_add( &x, \
-        std::chrono::duration_cast<std::chrono::nanoseconds>(t2-t1).count() );
+#define accumulate( x, t1, t2 )   \
+    AtomicOperations::atomic_add( \
+        &x, std::chrono::duration_cast<std::chrono::nanoseconds>( t2 - t1 ).count() );
 #endif
 
 
 #if MONITOR_THREADPOOL_PERFORMANCE == 1
-    static AtomicOperations::int64_atomic total_add_work_time[5] = {0,0,0,0,0};
+static AtomicOperations::int64_atomic total_add_work_time[5] = { 0, 0, 0, 0, 0 };
 #endif
 
 
 // Helper functions
-template <class T>
-void quicksort( int N, T* data );
-template <class T>
-inline void quicksort( std::vector<T> &x ) { quicksort((int)x.size(),x.data()); }
-static inline int find_id( int, const ThreadPool::thread_id_t*, const ThreadPool::thread_id_t& );
+template<class T>
+void quicksort( int N, T *data );
+template<class T>
+inline void quicksort( std::vector<T> &x )
+{
+    quicksort( (int) x.size(), x.data() );
+}
+static inline int find_id( int, const ThreadPool::thread_id_t *, const ThreadPool::thread_id_t & );
 
 
 // Function to generate a random size_t number (excluding 0 and ~0)
@@ -116,8 +129,8 @@ static size_t rand_size_t()
 
 
 /******************************************************************
-* Run some basic compile-time checks                              *
-******************************************************************/
+ * Run some basic compile-time checks                              *
+ ******************************************************************/
 #if MAX_NUM_THREADS % 64 != 0
 // We use a bit array for d_active and d_cancel
 #error MAX_NUM_THREADS must be a multiple of 64
@@ -130,47 +143,52 @@ static size_t rand_size_t()
 // We store the indicies to the queue list as short ints
 #error MAX_QUEUED must < 65535
 #endif
+// Check the c++ std
+#if CXX_STD == 98
+#error Thread pool class requires c++11 or newer
+#endif
 
 
 /******************************************************************
-* Get/Set a bit                                                   *
-* Note: these functions are thread-safe                           *
-******************************************************************/
+ * Get/Set a bit                                                   *
+ * Note: these functions are thread-safe                           *
+ ******************************************************************/
 static inline void set_bit( volatile AtomicOperations::int64_atomic *x, size_t index )
 {
     uint64_t mask = 0x01;
     mask <<= index % 64;
-    size_t i = index / 64;
+    size_t i  = index / 64;
     bool test = false;
     while ( !test ) {
         AtomicOperations::int64_atomic y = x[i];
-        test = AtomicOperations::atomic_compare_and_swap( &x[i], y, (y|mask) );
+        test = AtomicOperations::atomic_compare_and_swap( &x[i], y, ( y | mask ) );
     }
 }
 static inline void unset_bit( volatile AtomicOperations::int64_atomic *x, size_t index )
 {
     uint64_t mask = 0x01;
     mask <<= index % 64;
-    mask = ~mask;
-    size_t i = index / 64;
+    mask      = ~mask;
+    size_t i  = index / 64;
     bool test = false;
     while ( !test ) {
         AtomicOperations::int64_atomic y = x[i];
-        test = AtomicOperations::atomic_compare_and_swap( &x[i], y, (y&mask) );
+        test = AtomicOperations::atomic_compare_and_swap( &x[i], y, ( y & mask ) );
     }
 }
 static inline bool get_bit( const volatile AtomicOperations::int64_atomic *x, size_t index )
 {
     uint64_t mask = 0x01;
     mask <<= index % 64;
-    AtomicOperations::int64_atomic y = x[index / 64];   // This is thread-safe since we only care about a single bit
+    // Plain read of the whole word; treated as thread-safe since only one bit is examined
+    AtomicOperations::int64_atomic y = x[index / 64];
     return ( y & mask ) != 0;
 }
 
 
 /******************************************************************
-* Simple function to check if the parity is odd (true) or even    *
-******************************************************************/
+ * Simple function to check if the parity is odd (true) or even    *
+ ******************************************************************/
 static inline bool is_odd8( size_t x )
 { // This only works for 64-bit integers
     x ^= ( x >> 1 );
@@ -181,7 +199,7 @@ static inline bool is_odd8( size_t x )
     x ^= ( x >> 32 );
     return ( x & 0x01 ) > 0;
 }
-template <class int_type>
+template<class int_type>
 static inline int count_bits( int_type x )
 {
     int count = 0;
@@ -194,8 +212,18 @@ static inline int count_bits( int_type x )
 
 
 /******************************************************************
-* Set the bahvior of OS warnings                                  *
-******************************************************************/
+ * Set the global constants                                        *
+ ******************************************************************/
+constexpr int ThreadPool::MAX_NUM_THREADS;
+constexpr int ThreadPool::MAX_QUEUED;
+constexpr int ThreadPool::MAX_WAIT;
+constexpr bool ThreadPool::PROFILE_THREADPOOL_PERFORMANCE;
+constexpr bool ThreadPool::MONITOR_THREADPOOL_PERFORMANCE;
+
+
+/******************************************************************
+ * Set the behavior of OS warnings                                 *
+ ******************************************************************/
 static int global_OS_behavior = 0;
 std::mutex OS_warning_mutex;
 void ThreadPool::set_OS_warnings( int behavior )
@@ -213,11 +241,14 @@ static void OS_warning( const std::string &message )
     }
     OS_warning_mutex.unlock();
 }
-
+void ThreadPool::setErrorHandler( std::function<void( const std::string & )> fun )
+{
+    d_errorHandler = fun;
+}
 
 /******************************************************************
-* Function to return the number of processors availible           *
-******************************************************************/
+ * Function to return the number of processors availible           *
+ ******************************************************************/
 int ThreadPool::getNumberOfProcessors()
 {
 #if defined( USE_LINUX ) || defined( USE_MAC )
@@ -233,8 +264,8 @@ int ThreadPool::getNumberOfProcessors()
 
 
 /******************************************************************
-* Function to return the processor number of the current thread   *
-******************************************************************/
+ * Function to return the processor number of the current thread   *
+ ******************************************************************/
 int ThreadPool::getCurrentProcessor()
 {
 #if defined( USE_LINUX )
@@ -251,8 +282,8 @@ int ThreadPool::getCurrentProcessor()
 
 
 /******************************************************************
-* Function to get/set the affinity of the current process         *
-******************************************************************/
+ * Function to get/set the affinity of the current process         *
+ ******************************************************************/
 std::vector<int> ThreadPool::getProcessAffinity()
 {
     std::vector<int> procs;
@@ -325,8 +356,8 @@ void ThreadPool::setProcessAffinity( std::vector<int> procs )
 
 
 /******************************************************************
-* Function to get the thread affinities                           *
-******************************************************************/
+ * Function to get the thread affinities                           *
+ ******************************************************************/
 #ifdef USE_WINDOWS
 DWORD GetThreadAffinityMask( HANDLE thread )
 {
@@ -387,7 +418,7 @@ std::vector<int> ThreadPool::getThreadAffinity( int thread ) const
     if ( thread >= getNumThreads() )
         std::logic_error( "Invalid thread number" );
     std::vector<int> procs;
-    auto handle = const_cast<std::thread&>( d_thread[thread] ).native_handle();
+    auto handle = const_cast<std::thread &>( d_thread[thread] ).native_handle();
 #ifdef USE_LINUX
 #ifdef _GNU_SOURCE
     cpu_set_t mask;
@@ -423,8 +454,8 @@ std::vector<int> ThreadPool::getThreadAffinity( int thread ) const
 
 
 /******************************************************************
-* Function to set the thread affinity                             *
-******************************************************************/
+ * Function to set the thread affinity                             *
+ ******************************************************************/
 void ThreadPool::setThreadAffinity( std::vector<int> procs )
 {
 #ifdef USE_LINUX
@@ -458,7 +489,7 @@ void ThreadPool::setThreadAffinity( int thread, std::vector<int> procs ) const
 {
     if ( thread >= getNumThreads() )
         std::logic_error( "Invalid thread number" );
-    auto handle = const_cast<std::thread&>( d_thread[thread] ).native_handle();
+    auto handle = const_cast<std::thread &>( d_thread[thread] ).native_handle();
 #ifdef USE_LINUX
 #ifdef __USE_GNU
     cpu_set_t mask;
@@ -490,15 +521,15 @@ void ThreadPool::setThreadAffinity( int thread, std::vector<int> procs ) const
 
 
 /******************************************************************
-* Function to perform some basic checks before we start           *
-******************************************************************/
+ * Function to perform some basic checks before we start           *
+ ******************************************************************/
 void ThreadPool::check_startup( size_t size0 )
 {
     // Check the size of the class to make sure that we don't have any
     // byte alignment problems between a library implimentation and a calling pacakge
     size_t size1 = sizeof( ThreadPool );
-    size_t size2 = ( (size_t) &d_NULL_HEAD ) - ( ( size_t ) this ) + sizeof( size_t );
-    size_t size3 = ( (size_t) &d_NULL_TAIL ) - ( ( size_t ) this ) + sizeof( size_t );
+    size_t size2 = ( (size_t) &d_NULL_HEAD ) - ( (size_t) this ) + sizeof( size_t );
+    size_t size3 = ( (size_t) &d_NULL_TAIL ) - ( (size_t) this ) + sizeof( size_t );
     if ( size0 != size1 || size1 < size2 || size1 < size3 )
         throw std::logic_error( "Internal data format problem" );
     // Check the size of variables
@@ -517,7 +548,7 @@ void ThreadPool::check_startup( size_t size0 )
     ThreadPool::thread_id_t id;
     if ( id.getPriority() != -128 )
         pass = false;
-    id.reset( 3, 564, NULL );
+    id.reset( 3, 564, nullptr );
     if ( id.getPriority() != 3 || id.getLocalID() != 564 )
         pass = false;
     if ( count_bits( 0x0 ) != 0 || count_bits( 0x03 ) != 2 )
@@ -530,8 +561,10 @@ void ThreadPool::check_startup( size_t size0 )
         if ( is_odd8( ~( (size_t) 0 ) ) || !is_odd8( thread_id_t::maxThreadID ) )
             pass = false;
         for ( size_t i = 0; i < 1024; i++ ) {
-            if ( ( count_bits( thread_id_t::maxThreadID - i ) % 2 == 1 ) != is_odd8( thread_id_t::maxThreadID - i ) ) {
-                printp( "%i %i %s\n", count_bits( thread_id_t::maxThreadID - i ), is_odd8( thread_id_t::maxThreadID - i ) ? 1 : 0,
+            if ( ( count_bits( thread_id_t::maxThreadID - i ) % 2 == 1 ) !=
+                 is_odd8( thread_id_t::maxThreadID - i ) ) {
+                printp( "%i %i %s\n", count_bits( thread_id_t::maxThreadID - i ),
+                    is_odd8( thread_id_t::maxThreadID - i ) ? 1 : 0,
                     std::bitset<64>( thread_id_t::maxThreadID - i ).to_string().c_str() );
                 pass = false;
             }
@@ -550,27 +583,28 @@ void ThreadPool::check_startup( size_t size0 )
 
 
 /******************************************************************
-* Function to initialize the thread pool                          *
-******************************************************************/
+ * Function to initialize the thread pool                          *
+ ******************************************************************/
 void ThreadPool::initialize( const int N, const char *affinity, int N_procs, const int *procs )
 {
     // Initialize the header/tail
     d_NULL_HEAD = rand_size_t();
     d_NULL_TAIL = d_NULL_HEAD;
     // Initialize the variables to NULL values
-    d_id_assign    = 0;
-    d_signal_empty = false;
-    d_signal_count = 0;
-    d_N_threads    = 0;
-    d_num_active   = 0;
-    d_N_added      = 0;
-    d_N_started    = 0;
-    d_N_finished   = 0;
+    d_id_assign     = 0;
+    d_signal_empty  = false;
+    d_signal_count  = 0;
+    d_N_threads     = 0;
+    d_num_active    = 0;
+    d_N_added       = 0;
+    d_N_started     = 0;
+    d_N_finished    = 0;
+    d_max_wait_time = 600;
     memset( (void *) d_active, 0, MAX_NUM_THREADS / 8 );
     memset( (void *) d_cancel, 0, MAX_NUM_THREADS / 8 );
     d_wait_last = nullptr;
-    for ( int i     = 0; i < MAX_WAIT; i++ )
-        d_wait[i]   = nullptr;
+    for ( auto &i : d_wait )
+        i = nullptr;
     // Initialize the id
     d_id_assign = thread_id_t::maxThreadID;
     // Create the threads
@@ -579,14 +613,14 @@ void ThreadPool::initialize( const int N, const char *affinity, int N_procs, con
 
 
 /******************************************************************
-* This is the de-constructor                                      *
-******************************************************************/
+ * This is the destructor                                          *
+ ******************************************************************/
 ThreadPool::~ThreadPool()
 {
-    if ( !is_valid( this ) ) {
-        std::cerr << "Thread pool is not valid\n";
-        std::terminate();
-    }
+    DISABLE_WARNINGS
+    if ( !is_valid( this ) )
+        throw std::logic_error( "Thread pool is not valid" );
+    ENABLE_WARNINGS
     // Destroy the threads
     setNumThreads( 0 );
     // Delete all remaining data
@@ -598,16 +632,15 @@ ThreadPool::~ThreadPool()
     // Print the performance metrics
     printp( "ThreadPool Performance:\n" );
     printp( "add_work:  %lu us,  %lu us,  %lu us,  %lu us,  %lu us\n",
-        total_add_work_time[0]/1000, total_add_work_time[1]/1000,
-        total_add_work_time[2]/1000, total_add_work_time[3]/1000,
-        total_add_work_time[4]/1000 );
+        total_add_work_time[0] / 1000, total_add_work_time[1] / 1000, total_add_work_time[2] / 1000,
+        total_add_work_time[3] / 1000, total_add_work_time[4] / 1000 );
 #endif
 }
 
 
 /******************************************************************
-* Check if the pointer points to a valid thread pool object       *
-******************************************************************/
+ * Check if the pointer points to a valid thread pool object       *
+ ******************************************************************/
 bool ThreadPool::is_valid( const ThreadPool *tpool )
 {
     if ( tpool == nullptr )
@@ -621,8 +654,8 @@ bool ThreadPool::is_valid( const ThreadPool *tpool )
 
 
 /******************************************************************
-* This function creates the threads in the thread pool            *
-******************************************************************/
+ * This function creates the threads in the thread pool            *
+ ******************************************************************/
 void ThreadPool::setNumThreads(
     int num_worker_threads, const char *affinity2, int N_procs, const int *procs )
 {
@@ -643,8 +676,8 @@ void ThreadPool::setNumThreads(
     int d_N_threads_diff = num_worker_threads - d_N_threads;
     if ( d_N_threads_diff > 0 ) {
         // Check that no threads are in the process of being deleted
-        for ( int i = 0; i < MAX_NUM_THREADS / 64; i++ ) {
-            if ( d_cancel[i] != 0 )
+        for ( auto flags : d_cancel ) { // auto: full element width (long is 32-bit on LLP64)
+            if ( flags != 0 )
                 throw std::logic_error(
                     "Threads are being created and destroyed at the same time" );
         }
@@ -670,11 +703,11 @@ void ThreadPool::setNumThreads(
             j++;
         }
         // Wait for all of the threads to finish initialization
-        while ( 1 ) {
-            std::this_thread::sleep_for( std::chrono::milliseconds(25) );
+        while ( true ) {
+            std::this_thread::sleep_for( std::chrono::milliseconds( 25 ) );
             bool wait = false;
-            for ( int i = 0; i < MAX_NUM_THREADS / 64; i++ ) {
-                if ( d_cancel[i] != 0 )
+            for ( auto flags : d_cancel ) { // auto: full element width (long is 32-bit on LLP64)
+                if ( flags != 0 )
                     wait = true;
             }
             if ( !wait )
@@ -684,7 +717,7 @@ void ThreadPool::setNumThreads(
 #if defined( USE_LINUX ) || defined( USE_MAC )
         pthread_attr_destroy( &attr );
 #endif
-        std::this_thread::sleep_for( std::chrono::milliseconds(25) );
+        std::this_thread::sleep_for( std::chrono::milliseconds( 25 ) );
         delete[] tmp;
     } else if ( d_N_threads_diff < 0 ) {
         // Reduce the number of threads
@@ -697,7 +730,7 @@ void ThreadPool::setNumThreads(
             set_bit( d_cancel, d_N_threads - 1 + i );
         // Wake all threads to process the shutdown
         d_wait_work.notify_all();
-        std::this_thread::sleep_for( std::chrono::milliseconds(25) );
+        std::this_thread::sleep_for( std::chrono::milliseconds( 25 ) );
         // Wait for the threads to close
         for ( int i = 0; i > d_N_threads_diff; i-- ) {
             d_thread[d_N_threads - 1 + i].join();
@@ -732,13 +765,13 @@ void ThreadPool::setNumThreads(
         // We do not have a list of cpus to use, do nothing (OS not supported)
     } else if ( affinity == "none" ) {
         // We are using the default thread affinities (all threads get all procs of the program)
-        for ( int i    = 0; i < d_N_threads; i++ )
+        for ( int i = 0; i < d_N_threads; i++ )
             t_procs[i] = cpus;
     } else if ( affinity == "independent" ) {
         // We want to use an independent set of processors for each thread
         if ( (int) cpus.size() == d_N_threads ) {
             // The number of cpus matches the number of threads
-            for ( int i    = 0; i < d_N_threads; i++ )
+            for ( int i = 0; i < d_N_threads; i++ )
                 t_procs[i] = std::vector<int>( 1, cpus[i] );
         } else if ( (int) cpus.size() > d_N_threads ) {
             // There are more cpus than threads, threads will use more the one processor
@@ -752,7 +785,7 @@ void ThreadPool::setNumThreads(
             }
         } else {
             // There are fewer cpus than threads, threads will share a processor
-            int N_threads_proc =
+            auto N_threads_proc =
                 static_cast<int>( ( cpus.size() + d_N_threads - 1 ) / cpus.size() );
             for ( int i = 0; i < d_N_threads; i++ )
                 t_procs[i].push_back( cpus[i / N_threads_proc] );
@@ -776,10 +809,10 @@ void ThreadPool::setNumThreads(
 
 
 /******************************************************************
-* This is the function that controls the individual thread and    *
-* allows it to do work.                                           *
-* Note: this function is lock free                                *
-******************************************************************/
+ * This is the function that controls the individual thread and    *
+ * allows it to do work.                                           *
+ * Note: this function is lock free                                *
+ ******************************************************************/
 void ThreadPool::tpool_thread( int thread_id )
 {
     bool shutdown         = false;
@@ -797,8 +830,8 @@ void ThreadPool::tpool_thread( int thread_id )
         try {
             std::vector<int> cpus = ThreadPool::getProcessAffinity();
             printp( "%i cpus for current thread: ", (int) cpus.size() );
-            for ( size_t i = 0; i < cpus.size(); i++ )
-                printp( "%i ", cpus[i] );
+            for ( int cpu : cpus )
+                printp( "%i ", cpu );
             printp( "\n" );
         } catch ( ... ) {
             printp( "Unable to get process affinity\n" );
@@ -811,24 +844,39 @@ void ThreadPool::tpool_thread( int thread_id )
         // Check if there is work to do
         if ( d_queue_list.size() > 0 ) {
             // Get next work item to process
-            auto work_id = d_queue_list.remove( []( const thread_id_t& id ) { return id.ready(); } );
+            auto work_id =
+                d_queue_list.remove( []( const thread_id_t &id ) { return id.ready(); } );
             if ( work_id.isNull() ) {
                 std::this_thread::yield();
                 continue;
             }
-            WorkItem *work = work_id.work( );
+            WorkItem *work = work_id.work();
             AtomicOperations::atomic_increment( &d_N_started );
             // Start work here
             PROFILE_THREADPOOL_START( "thread working" );
-            work->d_state  = 2;
-            work->run();
-            work->d_state  = 3;
+            work->d_state = 2;
+            if ( d_errorHandler ) { // handler installed: report exceptions instead of leaking them
+                try {
+                    work->run();
+                } catch ( const std::exception &e ) {
+                    auto msg = Utilities::stringf(
+                        "Error, caught exception in thread %i:\n  %s\n", thread_id, e.what() );
+                    d_errorHandler( msg );
+                } catch ( ... ) {
+                    auto msg = Utilities::stringf(
+                        "Error, caught unknown exception in thread %i\n", thread_id );
+                    d_errorHandler( msg );
+                }
+            } else { // no handler: anything thrown by run() propagates out of this function
+                work->run();
+            }
+            work->d_state = 3;
             PROFILE_THREADPOOL_STOP( "thread working" );
             AtomicOperations::atomic_increment( &d_N_finished );
             // Check if any threads are waiting on the current work item
             // This can be done without blocking
-            for ( int i = 0; i < MAX_WAIT; i++ ) {
-                const wait_ids_struct *wait = const_cast<const wait_ids_struct *>(d_wait[i]);
+            for ( auto &i : d_wait ) {
+                auto wait = AtomicOperations::atomic_get( &i );
                 if ( wait != nullptr )
                     wait->id_finished( work_id );
             }
@@ -849,7 +897,7 @@ void ThreadPool::tpool_thread( int thread_id )
             }
             // Wait for work
             PROFILE_THREADPOOL_STOP2( "thread active" );
-            d_wait_work.wait_for(1e-3);
+            d_wait_work.wait_for( 1e-3 );
             PROFILE_THREADPOOL_START2( "thread active" );
             AtomicOperations::atomic_increment( &d_num_active );
             set_bit( d_active, thread_id );
@@ -865,21 +913,22 @@ void ThreadPool::tpool_thread( int thread_id )
 
 
 /******************************************************************
-* This is the function that adds work to the thread pool          *
-* Note: this version uses a last in - first out work scheduling.  *
-******************************************************************/
-inline void ThreadPool::add_work( const ThreadPool::thread_id_t& id )
+ * This is the function that adds work to the thread pool          *
+ * Note: this version uses a last in - first out work scheduling.  *
+ ******************************************************************/
+inline void ThreadPool::add_work( const ThreadPool::thread_id_t &id )
 {
-    auto work = id.work();
+    auto work     = id.work();
     work->d_state = 1;
     // Check and change priorities of dependency ids
     const int priority = id.getPriority();
-    for (int i=0; i<work->d_N_ids; i++) {
-        const auto& id1 = work->d_ids[i];
-        if ( !id1.started() && id1<id ) {
+    for ( int i = 0; i < work->d_N_ids; i++ ) {
+        const auto &id1 = work->d_ids[i];
+        if ( !id1.started() && id1 < id ) {
             // Remove and add the id back with a higher priority
-            auto id2 = d_queue_list.remove( []( const thread_id_t& a, const thread_id_t& b ) { return a==b; }, id1 );
-            id2.setPriority( std::max(priority,id2.getPriority()) );
+            auto id2 = d_queue_list.remove(
+                []( const thread_id_t &a, const thread_id_t &b ) { return a == b; }, id1 );
+            id2.setPriority( std::max( priority, id2.getPriority() ) );
             d_queue_list.insert( id2 );
         }
     }
@@ -894,7 +943,7 @@ void ThreadPool::add_work(
     if ( N > block_size ) {
         size_t i = 0;
         while ( i < N ) {
-            add_work( std::min(N-i,block_size), &work[i], &priority[i], &ids[i] );
+            add_work( std::min( N - i, block_size ), &work[i], &priority[i], &ids[i] );
             i += block_size;
         }
         return;
@@ -905,7 +954,7 @@ void ThreadPool::add_work(
 #endif
     // Create the thread ids (can be done without blocking)
     for ( size_t i = 0; i < N; i++ )
-        ids[i].reset( priority[i], AtomicOperations::atomic_decrement(&d_id_assign), work[i] );
+        ids[i].reset( priority[i], AtomicOperations::atomic_decrement( &d_id_assign ), work[i] );
 #if MONITOR_THREADPOOL_PERFORMANCE
     auto t2 = std::chrono::high_resolution_clock::now();
     accumulate( total_add_work_time[0], t1, t2 );
@@ -913,23 +962,23 @@ void ThreadPool::add_work(
     // If there are no threads, perform the work immediately
     if ( d_N_threads < 1 ) {
         for ( size_t i = 0; i < N; i++ ) {
-            work[i]->d_state  = 2;
+            work[i]->d_state = 2;
             work[i]->run();
-            work[i]->d_state  = 3;
+            work[i]->d_state = 3;
         }
-        #if MONITOR_THREADPOOL_PERFORMANCE
-            auto t5 = std::chrono::high_resolution_clock::now();
-            accumulate( total_add_work_time[4], t2, t5 );
-        #endif
+#if MONITOR_THREADPOOL_PERFORMANCE
+        auto t5 = std::chrono::high_resolution_clock::now();
+        accumulate( total_add_work_time[4], t2, t5 );
+#endif
         PROFILE_THREADPOOL_STOP2( "add_work" );
         return;
     }
     // Wait for enough room in the queue (doesn't need blocking since it isn't that precise)
     if ( N > static_cast<size_t>( MAX_QUEUED - d_queue_list.size() ) ) {
-        int N_wait = static_cast<int>( N - ( MAX_QUEUED - d_queue_list.size() ) );
+        auto N_wait = static_cast<int>( N - ( MAX_QUEUED - d_queue_list.size() ) );
         while ( N_wait > 0 ) {
             d_signal_count = static_cast<unsigned char>( std::min( N_wait, 255 ) );
-            d_wait_finished.wait_for(1e-4);
+            d_wait_finished.wait_for( 1e-4 );
             N_wait = static_cast<int>( N - ( MAX_QUEUED - d_queue_list.size() ) );
         }
     }
@@ -965,19 +1014,8 @@ void ThreadPool::add_work(
 
 
 /******************************************************************
-* This function removes a finished work item                      *
-******************************************************************/
-ThreadPool::WorkItem *ThreadPool::getFinishedWorkItem( ThreadPool::thread_id_t id ) const
-{
-    if ( id.finished() )
-        return id.work();
-    return nullptr;
-}
-
-
-/******************************************************************
-* This function waits for a some of the work items to finish      *
-******************************************************************/
+ * This function waits for some of the work items to finish        *
+ ******************************************************************/
 static inline void check_finished(
     size_t N_work, const ThreadPool::thread_id_t *ids, size_t &N_finished, bool *finished )
 {
@@ -1004,8 +1042,8 @@ int ThreadPool::wait_some(
             N_finished++;
         }
         size_t local_id = ids[k].getLocalID();
-        bool test       = local_id == 0 || local_id > thread_id_t::maxThreadID || local_id <= next_id;
-        test            = test && !finished[k];
+        bool test = local_id == 0 || local_id > thread_id_t::maxThreadID || local_id <= next_id;
+        test      = test && !finished[k];
         if ( test )
             throw std::logic_error( "Invalid ids for wait" );
     }
@@ -1018,7 +1056,7 @@ int ThreadPool::wait_some(
     auto tmp = new wait_ids_struct( N_work, ids, N_wait, d_cond_pool, MAX_WAIT, d_wait );
     // Wait for the ids
     auto t1 = std::chrono::high_resolution_clock::now();
-    while ( !tmp->wait_for(0.01) ) {
+    while ( !tmp->wait_for( 0.01 ) ) {
         check_wait_time( t1 );
     }
     // Update the ids that have finished
@@ -1027,33 +1065,35 @@ int ThreadPool::wait_some(
         throw std::logic_error( "Internal error: failed to wait" );
     // Delete the wait event struct
     // Note: we want to maintain the reference in case a thread is still using it
-    // Note: technically this should be atomic
-    std::swap(d_wait_last,tmp);
+    // Note: technically this should be atomic, but it really isn't necessary here
+    std::swap( d_wait_last, tmp );
     delete tmp;
     return N_finished;
 }
 
 
 /******************************************************************
-* This function waits for all of the threads to finish their work *
-******************************************************************/
-void ThreadPool::check_wait_time( std::chrono::time_point<std::chrono::high_resolution_clock>& t1 ) const
+ * This function waits for all of the threads to finish their work *
+ ******************************************************************/
+void ThreadPool::check_wait_time(
+    std::chrono::time_point<std::chrono::high_resolution_clock> &t1 ) const
 {
     auto t2 = std::chrono::high_resolution_clock::now();
-    if ( std::chrono::duration_cast<std::chrono::seconds>(t2-t1).count() > MAX_WAIT_TIME_DEBUG ) {
-        std::cout << "Warning: Maximum wait time in ThreadPool exceeded, threads may be hung\n";
-        std::cout << "N_active: " << d_num_active << std::endl;
-        std::cout << "N_queued: " << d_queue_list.size() << std::endl;
-        std::cout << "N_added: " << d_N_added << std::endl;
-        std::cout << "N_started: " << d_N_started << std::endl;
-        std::cout << "N_finished: " << d_N_finished << std::endl;
-        std::cout << "queue.insert(): " << d_queue_list.N_insert() << std::endl;
-        std::cout << "queue.remove(): " << d_queue_list.N_remove() << std::endl;
-        std::cout << "Stack Trace:\n";
-        auto call_stack = StackTrace::getAllCallStacks( );
+    if ( std::chrono::duration_cast<std::chrono::seconds>( t2 - t1 ).count() > d_max_wait_time ) {
+        pout << "Warning: Maximum wait time in ThreadPool exceeded, threads may be hung\n";
+        pout << "N_active: " << d_num_active << std::endl;
+        pout << "N_queued: " << d_queue_list.size() << std::endl;
+        pout << "N_added: " << d_N_added << std::endl;
+        pout << "N_started: " << d_N_started << std::endl;
+        pout << "N_finished: " << d_N_finished << std::endl;
+        pout << "queue.insert(): " << d_queue_list.N_insert() << std::endl;
+        pout << "queue.remove(): " << d_queue_list.N_remove() << std::endl;
+        pout << "Stack Trace:\n";
+        auto call_stack = StackTrace::getAllCallStacks();
+        StackTrace::cleanupStackTrace( call_stack );
         auto text = call_stack.print( "  " );
-        for ( auto& line : text )
-            std::cout << line << std::endl;
+        for ( auto &line : text )
+            pout << line << std::endl;
         t1 = std::chrono::high_resolution_clock::now();
     }
 }
@@ -1068,82 +1108,91 @@ void ThreadPool::wait_pool_finished() const
     while ( d_num_active > 0 || d_queue_list.size() > 0 ) {
         check_wait_time( t1 );
         d_signal_empty = true;
-        d_wait_finished.wait_for(10e-6);
+        d_wait_finished.wait_for( 10e-6 );
     }
     d_signal_empty = false;
 }
 
 
 /******************************************************************
-* Member functions of wait_ids_struct                             *
-******************************************************************/
-ThreadPool::wait_ids_struct::wait_ids_struct( size_t N, const ThreadPool::thread_id_t *ids, size_t N_wait,
-    AtomicOperations::pool<condition_variable,128>& cv_pool, int N_wait_list, volatile wait_ids_struct **list ):
-    d_wait( N_wait ),
-    d_N(0),
-    d_cv_pool( cv_pool ),
-    d_wait_event( cv_pool.get() )
+ * Member functions of wait_ids_struct                             *
+ ******************************************************************/
+ThreadPool::wait_ids_struct::wait_ids_struct( size_t N, const ThreadPool::thread_id_t *ids,
+    size_t N_wait, AtomicOperations::pool<condition_variable, 128> &cv_pool, int N_wait_list,
+    volatile wait_ids_struct **list )
+    : d_wait( N_wait ), d_N( 0 ), d_cv_pool( cv_pool ), d_wait_event( cv_pool.get() )
 {
     d_ids = new ThreadPool::thread_id_t[N];
     for ( size_t i = 0; i < N; i++ ) {
         if ( ids[i].finished() )
-            d_wait = std::max(d_wait-1,0);
+            d_wait = std::max( d_wait - 1, 0 );
         else
             d_ids[d_N++] = ids[i];
     }
     quicksort( d_N, d_ids );
     d_finished = new bool[d_N];
-    memset((void*)d_finished,0,d_N);
+    memset( (void *) d_finished, 0, d_N );
     int i = 0;
-    while ( !AtomicOperations::atomic_compare_and_swap( (void *volatile *) &list[i], nullptr, this ) ) { i = (i+1)%N_wait_list; }
+    while (
+        !AtomicOperations::atomic_compare_and_swap( (void *volatile *) &list[i], nullptr, this ) ) {
+        i = ( i + 1 ) % N_wait_list;
+    }
     d_ptr = &list[i];
 }
-void ThreadPool::wait_ids_struct::id_finished( const ThreadPool::thread_id_t& id ) const
+ThreadPool::wait_ids_struct::~wait_ids_struct()
+{
+    d_cv_pool.put( d_wait_event );
+    delete[] d_finished;
+    delete[] d_ids;
+}
+void ThreadPool::wait_ids_struct::id_finished( const ThreadPool::thread_id_t &id ) const
 {
     int index = find_id( d_N, d_ids, id );
     if ( index >= 0 ) {
         d_finished[index] = true;
-        int N_finished = 0;
-        for (int i=0; i<d_N; i++)
-            N_finished += d_finished[i] ? 1:0;
+        int N_finished    = 0;
+        for ( int i = 0; i < d_N; i++ )
+            N_finished += d_finished[i] ? 1 : 0;
         if ( N_finished >= d_wait ) {
-            *d_ptr = nullptr;
+            d_N    = 0;
             d_wait = 0;
-            d_N = 0;
+            AtomicOperations::atomic_compare_and_swap(
+                (void *volatile *) d_ptr, (void *) this, nullptr ); // only clear our own slot
             d_wait_event->notify_all();
         }
     }
 }
 bool ThreadPool::wait_ids_struct::wait_for( double seconds )
 {
-    for (int i=0; i<d_N; i++) {
+    for ( int i = 0; i < d_N; i++ ) {
         if ( d_ids[i].finished() )
             d_finished[i] = true;
     }
     auto t1 = std::chrono::high_resolution_clock::now();
     while ( true ) {
         int N_finished = 0;
-        for (int i=0; i<d_N; i++)
-            N_finished += d_finished[i] ? 1:0;
-        if ( N_finished>=d_wait || d_N==0 ) {
+        for ( int i = 0; i < d_N; i++ )
+            N_finished += d_finished[i] ? 1 : 0;
+        if ( N_finished >= d_wait || d_N == 0 ) {
             *d_ptr = nullptr;
             d_wait = 0;
-            d_N = 0;
+            d_N    = 0;
             break;
         }
         auto t2 = std::chrono::high_resolution_clock::now();
-        if ( 1e-6*std::chrono::duration_cast<std::chrono::microseconds>(t2-t1).count() > seconds )
+        if ( 1e-6 * std::chrono::duration_cast<std::chrono::microseconds>( t2 - t1 ).count() >
+             seconds )
             return false;
-        d_wait_event->wait_for(1e-5);
+        d_wait_event->wait_for( 1e-5 );
     }
     return true;
 }
 
 
 /******************************************************************
-* templated quicksort routine                                     *
-******************************************************************/
-template <class T>
+ * templated quicksort routine                                     *
+ ******************************************************************/
+template<class T>
 void quicksort( int n, T *arr )
 {
     if ( n <= 1 )
@@ -1154,7 +1203,7 @@ void quicksort( int n, T *arr )
     jstack = 0;
     l      = 0;
     ir     = n - 1;
-    while ( 1 ) {
+    while ( true ) {
         if ( ir - l < 7 ) { // Insertion sort when subarray small enough.
             for ( j = l + 1; j <= ir; j++ ) {
                 a    = arr[j];
@@ -1231,8 +1280,8 @@ void quicksort( int n, T *arr )
 
 
 /************************************************************************
-* Function to find the id in a sorted vector                            *
-************************************************************************/
+ * Function to find the id in a sorted vector                            *
+ ************************************************************************/
 inline int find_id( int n, const ThreadPool::thread_id_t *x, const ThreadPool::thread_id_t &id )
 {
     if ( n == 0 )
@@ -1243,7 +1292,7 @@ inline int find_id( int n, const ThreadPool::thread_id_t *x, const ThreadPool::t
     if ( id < x[0] )
         return -1;
     if ( id == x[n - 1] )
-        return n-1;
+        return n - 1;
     if ( id > x[n - 1] )
         return -1;
     // Perform the search
@@ -1264,13 +1313,13 @@ inline int find_id( int n, const ThreadPool::thread_id_t *x, const ThreadPool::t
 
 
 /************************************************************************
-* Function to add dependencies to the work item                         *
-* Note: when expanding the size of d_ids, we need to allocate space for *
-* one extra entry for a spinlock.                                       *
-************************************************************************/
+ * Function to add dependencies to the work item                         *
+ * Note: when expanding the size of d_ids, we need to allocate space for *
+ * one extra entry for a spinlock.                                       *
+ ************************************************************************/
 void ThreadPool::WorkItem::add_dependencies( size_t N, const ThreadPool::thread_id_t *ids )
 {
-    if ( d_state!=0 ) {
+    if ( d_state != 0 ) {
         // The item has already been added to the threadpool,
         // we are not allowed to add dependencies
         throw std::logic_error(
@@ -1291,9 +1340,9 @@ void ThreadPool::WorkItem::add_dependencies( size_t N, const ThreadPool::thread_
         for ( size_t i = 0; i < d_N_ids; i++ )
             const_cast<thread_id_t &>( ids[i] ).swap( tmp[i] );
         delete[] tmp;
-        d_size = N2;
-        int* lock = reinterpret_cast<int*>(&d_ids[d_size-1]);
-        *lock = 0;
+        d_size     = N2;
+        auto *lock = reinterpret_cast<int *>( &d_ids[d_size - 1] );
+        *lock      = 0;
     }
     const ThreadPool::thread_id_t id0;
     for ( size_t i = 0; i < N; i++ ) {
diff --git a/threadpool/thread_pool.h b/threadpool/thread_pool.h
index db3eec9d..eff12433 100644
--- a/threadpool/thread_pool.h
+++ b/threadpool/thread_pool.h
@@ -3,53 +3,25 @@
 // PARTICULAR PURPOSE.
 #ifndef included_AtomicModelThreadPool
 #define included_AtomicModelThreadPool
+
+#include <condition_variable>
 #include <iostream>
 #include <map>
+#include <mutex>
 #include <stdarg.h>
 #include <stdexcept>
 #include <stdio.h>
 #include <string.h>
+#include <thread>
 #include <typeinfo>
 #include <vector>
-#include <mutex>
-#include <thread>
-#include <condition_variable>
 
 
 #include "threadpool/atomic_helpers.h"
 #include "threadpool/atomic_list.h"
 
 
-// Choose the OS
-#if defined( WIN32 ) || defined( _WIN32 ) || defined( WIN64 ) || defined( _WIN64 )
-    // Using windows
-    #define USE_WINDOWS
-#elif defined( __APPLE__ )
-    // Using MAC
-    #define USE_MAC
-#elif defined( __linux ) || defined( __unix ) || defined( __posix )
-    // Using linux
-    #define USE_LINUX
-#else
-    #error Unknown OS
-#endif
-
-
-// Set some definitions
-#define MAX_NUM_THREADS 128     // The maximum number of threads (must be a multiple of 64)
-#define MAX_QUEUED 1024         // The maximum number of items in the work queue at any moment
-#define MAX_WAIT 16             // The maximum number of active waits at any given time
-#define MAX_WAIT_TIME_DEBUG 600 // The maximum time in a wait command before printing a warning message
-
-#define PROFILE_THREADPOOL_PERFORMANCE 0    // Add profile timers to the threadpool (default is 0)
-#define MONITOR_THREADPOOL_PERFORMANCE 0    // Add detailed performance counters (default is 0)
-
-
-// Check the c++ std
-#if CXX_STD==98
-#error Thread pool class requires c++11 or newer
-#endif
-
+// clang-format off
 
 
 /** \class ThreadPool
@@ -75,6 +47,13 @@
  */
 class ThreadPool
 {
+public:
+    ///// Set some global properties
+    constexpr static int MAX_NUM_THREADS = 128; // The maximum number of threads (must be a multiple of 64)
+    constexpr static int MAX_QUEUED = 1024;     // The maximum number of items in the work queue at any moment
+    constexpr static int MAX_WAIT = 16;         // The maximum number of active waits at any given time
+    constexpr static bool PROFILE_THREADPOOL_PERFORMANCE = false; // Add profile timers to the threadpool
+    constexpr static bool MONITOR_THREADPOOL_PERFORMANCE = false; // Add detailed performance counters
 
 public:
     ///// Member classes
@@ -102,7 +81,7 @@ public:
         inline thread_id_t( volatile thread_id_t &&rhs );
         inline thread_id_t &operator=( const thread_id_t &rhs ) volatile;
         inline thread_id_t &operator=( volatile thread_id_t &&rhs ) volatile;
-#ifndef USE_WINDOWS
+#if !defined( WIN32 ) && !defined( _WIN32 ) && !defined( WIN64 ) && !defined( _WIN64 )
         inline thread_id_t( const thread_id_t &rhs );
         inline thread_id_t &operator=( thread_id_t &&rhs );
         inline thread_id_t &operator=( const thread_id_t &rhs );
@@ -245,7 +224,7 @@ public:
         //! Run the work item
         virtual void run() override = 0;
         //! Will the routine return a result
-        virtual bool has_result() const override = 0;
+        virtual bool has_result() const override final { return !std::is_same<return_type,void>::value; }
         //! Return the results
         return_type get_results() const { return d_result; }
         //! Virtual destructor
@@ -353,10 +332,12 @@ public:
      *   in the ThreadPool without checking the existing work unless the desired number of
      *   threads is 0.  In this case, the function will wait for all work items to finish
      *   before deleting the existing work threads.
+     *
      *   Member threads may not call this function.
      * @param N                 The desired number of worker threads
      * @param affinity          The affinity scheduler to use:
      *                          none - Let the OS handle the affinities (default)
+     *
      *                          independent - Give each thread an independent set of processors
      * @param procs             The processors to use (defaults to the process affinitiy list)
      */
@@ -368,6 +349,16 @@ public:
     }
 
 
+    /*!
+     * \brief   Function to set the maximum wait time
+     * \details  This function sets the maximum time the thread pool will
+     *    wait before warning about a possible hung thread.
+     *    Default is to wait 10 minutes.
+     * @param time              The maximum time to wait before warning (seconds)
+     */
+    inline void setMaxWaitTimeDebug( const int time ) { d_max_wait_time = time; }
+
+
     /*!
      * \brief   Function to return the current thread number
      * \details  This function will return the thread number of current active thread.
@@ -400,16 +391,14 @@ public:
      * @param id                The id of the work item
      */
     template <class return_type>
-    inline return_type getFunctionRet( const thread_id_t &id ) const;
+    static inline return_type getFunctionRet( const thread_id_t &id );
 
 
     /*!
      * \brief   Function to create a work item
      * \details This function creates a work item that can be added to the queue
-     * @param work              Pointer to the work item to add
-     *                          Note that the threadpool will automatically destroy the item when
-     * finished
-     * @param priority          A value indicating the priority of the work item (0-default)
+     * @param routine           Function to call from the thread pool
+     * @param args              Function arguments to pass
      */
     template <class Ret, class... Args>
     static inline WorkItem* createWork( Ret( *routine )( Args... ), Args... args );
@@ -505,6 +494,7 @@ public:
      *   If successful it returns the indicies of the finished work items (the index in the array ids).
      *   Note: any thread may call this routine, but they will block until finished.
      *   For worker threads this may eventually lead to a deadlock.
+     * @param N_wait            Number of work items to wait for
      * @param ids               Vector of work items to wait for
      */
     inline std::vector<int> wait_some( int N_wait, const std::vector<thread_id_t> &ids ) const;
@@ -552,6 +542,69 @@ public:
     //! Return the number of items queued
     int N_queued( ) const { return d_queue_list.size(); }
 
+
+    //! Set the error handler for threads
+    void setErrorHandler( std::function<void(const std::string&)> fun );
+
+
+public: // Static interface
+
+    /*!
+     * \brief   Function to return the number of work threads
+     * \details This function returns the number of threads in the thread pool,
+     *    or 0 if the thread pool is empty or does not exist
+     * @param tpool         Threadpool to query (may be null)
+     */
+    static inline int numThreads( const ThreadPool* tpool ) { return tpool ? tpool->getNumThreads() : 0; }
+
+    /*!
+     * \brief   Function to add a work item
+     * \details This function adds a work item to the queue
+     *   Note: any thread may call this routine.
+     * @param tpool         Threadpool to add work to (may be null)
+     * @param work          Pointer to the work item to add
+     *                      Note that the threadpool will automatically destroy the item when finished
+     * @param priority      A value indicating the priority of the work item (0-default)
+     */
+    static inline thread_id_t add_work( ThreadPool* tpool, ThreadPool::WorkItem *work, int priority = 0 );
+
+
+    /*!
+     * \brief   Function to add multiple work items
+     * \details This function adds multiple work items to the queue
+     *   Note: any thread may call this routine.
+     * @param tpool         Threadpool to add work to (may be null)
+     * @param work          Vector of pointers to the work items to add
+     *                      Note that the threadpool will automatically destroy the item when finished
+     * @param priority      Vector of values indicating the priority of the work items
+     */
+    static inline std::vector<thread_id_t> add_work( ThreadPool* tpool, const std::vector<ThreadPool::WorkItem *> &work,
+        const std::vector<int> &priority = std::vector<int>() );
+
+
+    /*!
+     * \brief   Function to wait until all of the given work items have finished their work
+     * \details This function waits for all of the given work items to finish.  It returns 0
+     * if successful.
+     *   Note: any thread may call this routine, but they will block until finished.
+     *   For worker threads this may eventually lead to a deadlock.
+     * @param tpool         Threadpool containing work (must match call to add_work)
+     * @param ids           Vector of work items to wait for
+     */
+    static inline int wait_all( const ThreadPool* tpool, const std::vector<thread_id_t> &ids );
+
+
+    /*!
+     * \brief   Function to wait until all work items in the thread pool have finished their work
+     * \details This function will wait until all work has finished.
+     *   Note: member threads may not call this function.
+     *   Only one non-member thread should call this routine at a time.
+     * @param tpool         Threadpool containing work (must match call to add_work)
+     */
+    static inline void wait_pool_finished( const ThreadPool* tpool ) { if ( tpool ) { tpool->wait_pool_finished(); } }
+
+
+
 private:
     typedef AtomicOperations::int32_atomic int32_atomic;
 
@@ -593,7 +646,7 @@ private:
       public:
         wait_ids_struct( size_t N, const ThreadPool::thread_id_t *ids, size_t N_wait,
             AtomicOperations::pool<condition_variable,128>& cv_pool, int N_wait_list, volatile wait_ids_struct **list );
-        ~wait_ids_struct( ) { d_cv_pool.put( d_wait_event ); delete [] d_finished; delete [] d_ids; }
+        ~wait_ids_struct( );
         void id_finished( const ThreadPool::thread_id_t& id ) const;
         bool wait_for( double seconds );
       private:
@@ -628,7 +681,10 @@ private:
     inline void add_work( const ThreadPool::thread_id_t& id );
 
     // Function to get a work item that has finished
-    WorkItem *getFinishedWorkItem( ThreadPool::thread_id_t id ) const;
+    static inline WorkItem *getFinishedWorkItem( const ThreadPool::thread_id_t& id )
+    {
+        return id.finished() ? id.work():nullptr; // nullptr unless id.finished() is true
+    }
 
     // This function provides a wrapper (needed for the threads)
     static inline void create_new_thread( ThreadPool *tpool, int id )
@@ -676,10 +732,13 @@ private:
     std::thread::id d_threadId[MAX_NUM_THREADS]; // Unique id for each thread
     queue_type d_queue_list;                // The work queue
     size_t d_NULL_TAIL;                     // Null data buffer to check memory bounds
+    int d_max_wait_time;                    // The maximum time in a wait command before printing a warning message
+    std::function<void(const std::string&)> d_errorHandler;
 };
 
 
 #include "threadpool/thread_pool.hpp"
 
 
+// clang-format on
 #endif
diff --git a/threadpool/thread_pool.hpp b/threadpool/thread_pool.hpp
index 60840ec1..a87860b3 100644
--- a/threadpool/thread_pool.hpp
+++ b/threadpool/thread_pool.hpp
@@ -23,7 +23,7 @@
  */
 #define TPOOL_TUPLE_TO_SEQ( t ) TPOOL_TUPLE_TO_SEQ_##II t
 #define TPOOL_TUPLE_TO_SEQ_II( a, ... ) a, ##__VA_ARGS__
-#ifdef USE_WINDOWS
+#if defined( WIN32 ) || defined( _WIN32 ) || defined( WIN64 ) || defined( _WIN64 )
 #define TPOOL_GET_PRIORITY( a, N, c, ... ) N
 #define TPOOL_ADD_WORK( TPOOL, FUNCTION, ARGS, ... )                                      \
     ThreadPool_add_work( TPOOL, TPOOL_GET_PRIORITY( 0, __VA_ARGS__, 0, 0 ) + 0, FUNCTION, \
@@ -40,35 +40,35 @@
 // \cond HIDDEN_SYMBOLS
 
 
-
 // Unpack a tuple and call a function
-template <int...>
+template<int...>
 struct index_tuple {
 };
-template <int I, typename IndexTuple, typename... Types>
+template<int I, typename IndexTuple, typename... Types>
 struct make_indexes_impl;
-template <int I, int... Indexes, typename T, typename... Types>
+template<int I, int... Indexes, typename T, typename... Types>
 struct make_indexes_impl<I, index_tuple<Indexes...>, T, Types...> {
     typedef typename make_indexes_impl<I + 1, index_tuple<Indexes..., I>, Types...>::type type;
 };
-template <int I, int... Indexes>
+template<int I, int... Indexes>
 struct make_indexes_impl<I, index_tuple<Indexes...>> {
     typedef index_tuple<Indexes...> type;
 };
-template <typename... Types>
+template<typename... Types>
 struct make_indexes : make_indexes_impl<0, index_tuple<>, Types...> {
 };
-template <class Ret, class... Args, int... Indexes>
-inline Ret apply_helper( Ret ( *pf )( Args... ), index_tuple<Indexes...>, std::tuple<Args...> &&tup )
+template<class Ret, class... Args, int... Indexes>
+inline Ret apply_helper(
+    Ret ( *pf )( Args... ), index_tuple<Indexes...>, std::tuple<Args...> &&tup )
 {
     return pf( std::forward<Args>( std::get<Indexes>( tup ) )... );
 }
-template <class Ret, class... Args>
+template<class Ret, class... Args>
 inline Ret apply( Ret ( *pf )( Args... ), const std::tuple<Args...> &tup )
 {
     return apply_helper( pf, typename make_indexes<Args...>::type(), std::tuple<Args...>( tup ) );
 }
-template <class Ret, class... Args>
+template<class Ret, class... Args>
 inline Ret apply( Ret ( *pf )( Args... ), std::tuple<Args...> &&tup )
 {
     return apply_helper(
@@ -77,21 +77,21 @@ inline Ret apply( Ret ( *pf )( Args... ), std::tuple<Args...> &&tup )
 
 
 // Specialization for no return argument
-template <>
+template<>
 class ThreadPool::WorkItemRet<void> : public ThreadPool::WorkItem
 {
 public:
     virtual void run() override = 0;
-    virtual bool has_result() const override { return false; }
     void get_results() {}
     virtual ~WorkItemRet() {}
+    virtual bool has_result() const override final { return false; }
 };
 
 
 // Final class for the work item
-template <class Ret, class... Args>
+template<class Ret, class... Args>
 class WorkItemFull;
-template <class... Args>
+template<class... Args>
 class WorkItemFull<void, Args...> : public ThreadPool::WorkItemRet<void>
 {
 private:
@@ -104,14 +104,10 @@ public:
         : ThreadPool::WorkItemRet<void>(), routine( routine2 ), args( ts... )
     {
     }
-    virtual void run() override
-    {
-        apply( routine, args );
-    }
-    virtual bool has_result() const override { return false; }
+    virtual void run() override { apply( routine, args ); }
     virtual ~WorkItemFull() {}
 };
-template <class Ret, class... Args>
+template<class Ret, class... Args>
 class WorkItemFull : public ThreadPool::WorkItemRet<Ret>
 {
 private:
@@ -124,62 +120,60 @@ public:
         : ThreadPool::WorkItemRet<Ret>(), routine( routine2 ), args( ts... )
     {
     }
-    virtual void run() override
-    {
-        this->d_result = apply( routine, args );
-    }
-    virtual bool has_result() const override { return true; }
+    virtual void run() override { this->d_result = apply( routine, args ); }
     virtual ~WorkItemFull() {}
 };
 
 
 // Functions to add work to the thread pool
-template <class Ret, class... Ts>
+template<class Ret, class... Ts>
 inline ThreadPool::thread_id_t ThreadPool_add_work(
     ThreadPool *tpool, int priority, Ret ( *routine )( Ts... ), Ts... ts )
 {
-    ThreadPool::WorkItem *work = new WorkItemFull<Ret, Ts...>( routine, ts... );
-    return tpool->add_work( work, priority );
+    auto work = new WorkItemFull<Ret, Ts...>( routine, ts... );
+    return ThreadPool::add_work( tpool, work, priority );
 }
-template <class Ret>
+template<class Ret>
 inline ThreadPool::thread_id_t ThreadPool_add_work(
     ThreadPool *tpool, int priority, Ret ( *routine )(), void * )
 {
-    ThreadPool::WorkItem *work = new WorkItemFull<Ret>( routine );
-    return tpool->add_work( work, priority );
+    auto work = new WorkItemFull<Ret>( routine );
+    return ThreadPool::add_work( tpool, work, priority );
 }
-template <class Ret, class... Args>
-inline ThreadPool::WorkItem* ThreadPool::createWork( Ret( *routine )( Args... ), Args... args )
+template<class Ret, class... Args>
+inline ThreadPool::WorkItem *ThreadPool::createWork( Ret ( *routine )( Args... ), Args... args )
 {
     return new WorkItemFull<Ret, Args...>( routine, args... );
 }
 
 
 /******************************************************************
-* Function to get the returned function value                     *
-******************************************************************/
-template <class T> inline constexpr T zeroConstructor();
-template<> inline constexpr bool zeroConstructor<bool>( ) { return false; }
-template<> inline constexpr char zeroConstructor<char>( ) { return 0; }
-template<> inline constexpr unsigned char zeroConstructor<unsigned char>( ) { return 0; }
-template<> inline constexpr int zeroConstructor<int>( ) { return 0; }
-template<> inline constexpr unsigned int zeroConstructor<unsigned int>( ) { return 0; }
-template<> inline constexpr long zeroConstructor<long>( ) { return 0; }
-template<> inline constexpr unsigned long zeroConstructor<unsigned long>( ) { return 0; }
-template<> inline constexpr float zeroConstructor<float>( ) { return 0; }
-template<> inline constexpr double zeroConstructor<double>( ) { return 0; }
-template <class T> inline constexpr T zeroConstructor() { return T(); }
-template <class Ret>
-inline Ret ThreadPool::getFunctionRet( const ThreadPool::thread_id_t &id ) const
+ * Function to get the returned function value                     *
+ ******************************************************************/
+// clang-format off
+template<class T> inline constexpr T zeroConstructor();
+template<> inline constexpr bool zeroConstructor<bool>() { return false; }
+template<> inline constexpr char zeroConstructor<char>() { return 0; }
+template<> inline constexpr unsigned char zeroConstructor<unsigned char>() { return 0; }
+template<> inline constexpr int zeroConstructor<int>() { return 0; }
+template<> inline constexpr unsigned int zeroConstructor<unsigned int>() { return 0; }
+template<> inline constexpr long zeroConstructor<long>() { return 0; }
+template<> inline constexpr unsigned long zeroConstructor<unsigned long>() { return 0; }
+template<> inline constexpr float zeroConstructor<float>() { return 0; }
+template<> inline constexpr double zeroConstructor<double>() { return 0; }
+template<class T> inline constexpr T zeroConstructor() { return T(); }
+template<class Ret>
+inline Ret ThreadPool::getFunctionRet( const ThreadPool::thread_id_t &id )
 {
-    WorkItemRet<Ret> *work = dynamic_cast<WorkItemRet<Ret>*>( getFinishedWorkItem( id ) );
+    auto work = dynamic_cast<WorkItemRet<Ret> *>( getFinishedWorkItem( id ) );
     return work == nullptr ? zeroConstructor<Ret>() : work->get_results();
 }
+// clang-format on
 
 
 /******************************************************************
-* Inline functions to wait for the work items to finish           *
-******************************************************************/
+ * Inline functions to wait for the work items to finish           *
+ ******************************************************************/
 inline int ThreadPool::wait( ThreadPool::thread_id_t id ) const
 {
     bool finished;
@@ -218,7 +212,7 @@ inline int ThreadPool::wait_any( const std::vector<thread_id_t> &ids ) const
 }
 inline int ThreadPool::wait_all( size_t N_work, const ThreadPool::thread_id_t *ids ) const
 {
-    if ( N_work==0 )
+    if ( N_work == 0 )
         return 0;
     auto finished = new bool[N_work];
     wait_some( N_work, ids, N_work, finished );
@@ -234,25 +228,32 @@ inline int ThreadPool::wait_all( const std::vector<thread_id_t> &ids ) const
     delete[] finished;
     return 0;
 }
-inline std::vector<int> ThreadPool::wait_some( int N_wait, const std::vector<thread_id_t> &ids ) const
+inline int ThreadPool::wait_all( const ThreadPool *tpool, const std::vector<thread_id_t> &ids )
 {
-    auto finished = new bool[ids.size()];
+    if ( tpool )
+        return tpool->wait_all( ids );
+    return 0; // No pool: static add_work() runs items inline, so report success (0) as documented
+}
+inline std::vector<int> ThreadPool::wait_some(
+    int N_wait, const std::vector<thread_id_t> &ids ) const
+{
+    auto finished  = new bool[ids.size()];
     int N_finished = wait_some( ids.size(), ids.data(), N_wait, finished );
-    std::vector<int> index(N_finished,-1);
-    for ( size_t i=0, j=0; i < ids.size(); i++ ) {
+    std::vector<int> index( N_finished, -1 );
+    for ( size_t i = 0, j = 0; i < ids.size(); i++ ) {
         if ( finished[i] ) {
             index[j] = i;
             j++;
         }
     }
-    delete [] finished;
+    delete[] finished;
     return index;
 }
 
 
 /******************************************************************
-* Functions to add work items.                                    *
-******************************************************************/
+ * Functions to add work items.                                    *
+ ******************************************************************/
 inline ThreadPool::thread_id_t ThreadPool::add_work( WorkItem *work, int priority )
 {
     ThreadPool::thread_id_t id;
@@ -280,11 +281,37 @@ inline std::vector<ThreadPool::thread_id_t> ThreadPool::add_work(
         delete[] priority2;
     return ids;
 }
+inline ThreadPool::thread_id_t ThreadPool::add_work(
+    ThreadPool *tpool, ThreadPool::WorkItem *work, int priority )
+{
+    ThreadPool::thread_id_t id;
+    if ( tpool ) {
+        id = tpool->add_work( work, priority ); // a pool exists: defer to the member overload
+    } else {
+        id.reset( priority, std::rand(), work ); // no pool: build the id with a random local id
+        work->d_state = 2;
+        work->run(); // run the work item directly on the calling thread
+        work->d_state = 3;
+    }
+    return id;
+}
+inline std::vector<ThreadPool::thread_id_t> ThreadPool::add_work( ThreadPool *tpool,
+    const std::vector<ThreadPool::WorkItem *> &work, const std::vector<int> &priority )
+{
+    if ( tpool )
+        return tpool->add_work( work, priority );
+    // No pool: run each item through the single-item overload (which executes it inline).
+    // priority defaults to an empty vector, so missing entries fall back to priority 0.
+    std::vector<ThreadPool::thread_id_t> ids( work.size() );
+    for ( size_t i = 0; i < work.size(); i++ )
+        ids[i] = add_work( tpool, work[i], i < priority.size() ? priority[i] : 0 );
+    return ids;
+}
 
 
 /******************************************************************
-* Class functions to for the thread id                            *
-******************************************************************/
+ * Class functions for the thread id                               *
+ ******************************************************************/
 inline ThreadPool::thread_id_t::thread_id_t()
     : d_id( nullThreadID ), d_count( NULL ), d_work( NULL )
 {
@@ -326,7 +353,7 @@ inline ThreadPool::thread_id_t::thread_id_t( const volatile ThreadPool::thread_i
     if ( d_count != NULL )
         AtomicOperations::atomic_increment( d_count );
 }
-#ifndef USE_WINDOWS
+#if !defined( WIN32 ) && !defined( _WIN32 ) && !defined( WIN64 ) && !defined( _WIN64 )
 inline ThreadPool::thread_id_t::thread_id_t( const thread_id_t &rhs )
     : d_id( rhs.d_id ), d_count( rhs.d_count ), d_work( rhs.d_work )
 {
@@ -417,8 +444,8 @@ inline uint64_t ThreadPool::thread_id_t::createId( int priority, uint64_t local_
     if ( priority >= 0 )
         tmp2 |= 0x80;
     uint64_t id = tmp2;
-    id = ( id << 56 ) + local_id;
-    return id;    
+    id          = ( id << 56 ) + local_id;
+    return id;
 }
 inline void ThreadPool::thread_id_t::reset( int priority, uint64_t local_id, void *work )
 {
@@ -435,8 +462,8 @@ inline void ThreadPool::thread_id_t::reset( int priority, uint64_t local_id, voi
     d_count = nullptr;
     d_work  = nullptr;
     if ( work != nullptr ) {
-        d_work = work;
-        d_count = &(reinterpret_cast<WorkItem *>( work )->d_count);
+        d_work   = work;
+        d_count  = &( reinterpret_cast<WorkItem *>( work )->d_count );
         *d_count = 1;
     }
 }
@@ -473,7 +500,7 @@ inline bool ThreadPool::thread_id_t::ready() const
     bool ready = true;
     if ( !isNull() ) {
         auto tmp = work();
-        for (size_t i=0; i<tmp->d_N_ids; i++)
+        for ( size_t i = 0; i < tmp->d_N_ids; i++ )
             ready = ready && tmp->d_ids[i].finished();
     }
     return ready;
@@ -481,21 +508,22 @@ inline bool ThreadPool::thread_id_t::ready() const
 
 
 /******************************************************************
-* This function checks if the id is valid                         *
-******************************************************************/
+ * This function checks if the id is valid                         *
+ ******************************************************************/
 inline bool ThreadPool::isValid( const ThreadPool::thread_id_t &id ) const
 {
-    static_assert( sizeof(atomic_64)==8, "atomic_64 must be a 64-bit integer" );
+    static_assert( sizeof( atomic_64 ) == 8, "atomic_64 must be a 64-bit integer" );
     uint64_t local_id = id.getLocalID();
     uint64_t next_id  = d_id_assign - 1;
-    return local_id!=0 && id.initialized() && local_id<=thread_id_t::maxThreadID && local_id>next_id;
+    return local_id != 0 && id.initialized() && local_id <= thread_id_t::maxThreadID &&
+           local_id > next_id;
 }
 
 
 /******************************************************************
-* Function to get the thread number                               *
-* (-1 if it is not a member thread)                               *
-******************************************************************/
+ * Function to get the thread number                               *
+ * (-1 if it is not a member thread)                               *
+ ******************************************************************/
 inline int ThreadPool::getThreadNumber() const
 {
     std::thread::id id = std::this_thread::get_id();