// This file implements a wrapper class for MPI functions
#include "common/MPI.h"
#include "common/Utilities.h"
#include "ProfilerApp.h"
#include "StackTrace/ErrorHandlers.h"
#include "StackTrace/StackTrace.h"
// Include all other headers
#include <algorithm>
#include <chrono>
#include <climits>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <limits>
#include <random>
#include <stdexcept>
#include <thread>
#include <typeinfo>
// Include OS specific headers
#undef USE_WINDOWS
#undef USE_LINUX
#undef USE_MAC
#if defined( WIN32 ) || defined( _WIN32 ) || defined( WIN64 ) || defined( _WIN64 )
// We are using windows
#define USE_WINDOWS
#include <process.h>
#include <windows.h>
#define sched_yield() Sleep( 0 )
#elif defined( __APPLE__ )
// Using MAC
#define USE_MAC
#include <sched.h>
#elif defined( __linux ) || defined( __linux__ ) || defined( __unix ) || defined( __posix )
// We are using linux
#define USE_LINUX
#include <sched.h>
#include <unistd.h>
#else
#error Unknown OS
#endif
// Convenience defines
#define MPI_ERROR ERROR
#define MPI_ASSERT ASSERT
#define MPI_INSIST INSIST
#define MPI_WARNING WARNING
#define MPI_CLASS_COMM_NULL MPI_COMM_NULL
#define MPI_CLASS_COMM_SELF MPI_COMM_SELF
#define MPI_CLASS_COMM_WORLD MPI_COMM_WORLD
// Global variable to track create new unique comms (dup and split)
#ifndef USE_MPI
MPI_Comm uniqueGlobalComm = 11;
#endif
#if defined( USE_SAMRAI ) && defined( USE_PETSC ) && !defined( USE_MPI )
int MPI_REQUEST_NULL = 3;
int MPI_ERR_IN_STATUS = 4;
#endif
namespace Utilities {
// Some special structs to work with MPI
#ifdef USE_MPI
struct IntIntStruct {
int j;
int i;
};
struct LongIntStruct {
long int j;
int i;
};
struct FloatIntStruct {
float f;
int i;
};
struct DoubleIntStruct {
double d;
int i;
};
#endif
// Initialize the static member variables
volatile unsigned int MPI_CLASS::N_MPI_Comm_created = 0;
volatile unsigned int MPI_CLASS::N_MPI_Comm_destroyed = 0;
short MPI_CLASS::profile_level = 127;
// Define a type for use with size_t
#ifdef USE_MPI
static MPI_Datatype MPI_SIZE_T = 0x0;
static MPI_Datatype getSizeTDataType()
{
int size_int, size_long, size_longlong, size_longlong2;
MPI_Type_size( MPI_UNSIGNED, &size_int );
MPI_Type_size( MPI_UNSIGNED_LONG, &size_long );
MPI_Type_size( MPI_UNSIGNED_LONG_LONG, &size_longlong );
MPI_Type_size( MPI_LONG_LONG_INT, &size_longlong2 );
if ( sizeof( size_t ) == size_int ) {
return MPI_UNSIGNED;
} else if ( sizeof( size_t ) == size_long ) {
return MPI_UNSIGNED_LONG;
} else if ( sizeof( size_t ) == size_longlong ) {
return MPI_UNSIGNED_LONG_LONG;
} else if ( sizeof( size_t ) == size_longlong2 ) {
MPI_WARNING( "Using signed long long datatype for size_t in MPI" );
return MPI_LONG_LONG_INT; // Note: this is not unsigned
} else {
MPI_ERROR( "No suitable datatype found" );
}
return 0;
}
#endif
// Static data for asynchronous communication without MPI
// Note: these routines may not be thread-safe yet
#ifndef USE_MPI
static const int mpi_max_tag = 0x003FFFFF;
struct Isendrecv_struct {
const char *data; // Pointer to data
int status; // Status: 1-sending, 2-receiving
};
std::map<MPI_Request, Isendrecv_struct> global_isendrecv_list;
static MPI_Request getRequest( MPI_Comm comm, int tag )
{
MPI_ASSERT( tag >= 0 && tag <= mpi_max_tag );
// Use hashing function: 2^64*0.5*(sqrt(5)-1)
uint64_t a = static_cast<uint8_t>( comm ) * 0x9E3779B97F4A7C15;
uint64_t b = static_cast<uint8_t>( tag ) * 0x9E3779B97F4A7C15;
uint64_t hash = a ^ b;
MPI_Request request;
memcpy( &request, &hash, sizeof( MPI_Request ) );
return request;
}
#endif
// Check the mpi error code
#ifdef USE_MPI
inline void check_MPI( int error )
{
if ( error != MPI_SUCCESS )
MPI_ERROR( "Error calling MPI routine" );
}
#endif
/******************************************************************
* Some helper functions to convert between signed/unsigned types *
******************************************************************/
DISABLE_WARNINGS
static inline constexpr unsigned int offset_int()
{
return ~static_cast<unsigned int>( std::numeric_limits<int>::min() ) + 1;
}
static inline constexpr unsigned long int offset_long()
{
return ~static_cast<unsigned long int>( std::numeric_limits<long int>::min() ) + 1;
}
static inline constexpr unsigned long long int offset_long_long()
{
return ~static_cast<unsigned long long int>( std::numeric_limits<long long int>::min() ) + 1;
}
ENABLE_WARNINGS
static inline unsigned int signed_to_unsigned( int x )
{
const auto offset = offset_int();
return ( x >= 0 ) ? static_cast<unsigned int>( x ) + offset :
offset - static_cast<unsigned int>( -x );
}
static inline unsigned long int signed_to_unsigned( long int x )
{
const auto offset = offset_long();
return ( x >= 0 ) ? static_cast<unsigned long int>( x ) + offset :
offset - static_cast<unsigned long int>( -x );
}
static inline unsigned long long int signed_to_unsigned( long long int x )
{
const auto offset = offset_long_long();
return ( x >= 0 ) ? static_cast<unsigned long long int>( x ) + offset :
offset - static_cast<unsigned long long int>( -x );
}
static inline int unsigned_to_signed( unsigned int x )
{
const auto offset = offset_int();
return ( x >= offset ) ? static_cast<int>( x - offset ) : -static_cast<int>( offset - x );
}
static inline long int unsigned_to_signed( unsigned long int x )
{
const auto offset = offset_long();
return ( x >= offset ) ? static_cast<long int>( x - offset ) :
-static_cast<long int>( offset - x );
}
static inline long long int unsigned_to_signed( unsigned long long int x )
{
const auto offset = offset_long_long();
return ( x >= offset ) ? static_cast<long long int>( x - offset ) :
-static_cast<long long int>( offset - x );
}
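/* Worked example (illustrative, not part of the original source): the helpers above map
 * signed values to unsigned values (and back) while preserving order, so the unsigned
 * reductions below can reuse the signed MINLOC/MAXLOC code paths. With the int offset
 * of 2^31:
 *     signed_to_unsigned( -5 ) == 0x80000000u - 5 == 0x7FFFFFFBu
 *     signed_to_unsigned(  0 ) == 0x80000000u
 *     signed_to_unsigned(  7 ) == 0x80000000u + 7 == 0x80000007u
 *     unsigned_to_signed( signed_to_unsigned( x ) ) == x for any int x
 */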
/************************************************************************
* Get the MPI version *
************************************************************************/
std::array<int, 2> MPI_CLASS::version()
{
#ifdef USE_MPI
int MPI_version;
int MPI_subversion;
MPI_Get_version( &MPI_version, &MPI_subversion );
return { MPI_version, MPI_subversion };
#else
return { 0, 0 };
#endif
}
std::string MPI_CLASS::info()
{
#ifdef USE_MPI
#if MPI_VERSION >= 3
int MPI_version_length = 0;
char MPI_version_string[MPI_MAX_LIBRARY_VERSION_STRING];
MPI_Get_library_version( MPI_version_string, &MPI_version_length );
if ( MPI_version_length > 0 ) {
std::string MPI_info( MPI_version_string, MPI_version_length );
size_t pos = MPI_info.find( '\n' );
while ( pos != std::string::npos ) {
MPI_info.insert( pos + 1, " " );
pos = MPI_info.find( '\n', pos + 1 );
}
return MPI_info;
}
#endif
auto tmp = version();
return std::to_string( tmp[0] ) + "." + std::to_string( tmp[1] );
#else
return std::string();
#endif
}
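/* Usage sketch (illustrative, not part of the original source):
 *     auto ver = MPI_CLASS::version();      // e.g. { 3, 1 } for an MPI 3.1 library
 *     std::string txt = MPI_CLASS::info();  // full library version string when MPI >= 3
 *     printf( "MPI %i.%i\n%s\n", ver[0], ver[1], txt.c_str() );
 */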
/************************************************************************
* Functions to get/set the process affinities *
************************************************************************/
int MPI_CLASS::getNumberOfProcessors() { return std::thread::hardware_concurrency(); }
std::vector<int> MPI_CLASS::getProcessAffinity()
{
std::vector<int> procs;
#ifdef USE_LINUX
cpu_set_t mask;
int error = sched_getaffinity( getpid(), sizeof( cpu_set_t ), &mask );
if ( error != 0 )
MPI_ERROR( "Error getting process affinity" );
for ( int i = 0; i < (int) sizeof( cpu_set_t ) * CHAR_BIT; i++ ) {
if ( CPU_ISSET( i, &mask ) )
procs.push_back( i );
}
#elif defined( USE_MAC )
// MAC does not support getting or setting the affinity
printf( "Warning: MAC does not support getting the process affinity\n" );
procs.clear();
#elif defined( USE_WINDOWS )
HANDLE hProc = GetCurrentProcess();
size_t procMask;
size_t sysMask;
PDWORD_PTR procMaskPtr = reinterpret_cast<PDWORD_PTR>( &procMask );
PDWORD_PTR sysMaskPtr = reinterpret_cast<PDWORD_PTR>( &sysMask );
GetProcessAffinityMask( hProc, procMaskPtr, sysMaskPtr );
for ( int i = 0; i < (int) sizeof( size_t ) * CHAR_BIT; i++ ) {
if ( ( procMask & 0x1 ) != 0 )
procs.push_back( i );
procMask >>= 1;
}
#else
#error Unknown OS
#endif
return procs;
}
void MPI_CLASS::setProcessAffinity( const std::vector<int> &procs )
{
#ifdef USE_LINUX
cpu_set_t mask;
CPU_ZERO( &mask );
for ( auto cpu : procs )
CPU_SET( cpu, &mask );
int error = sched_setaffinity( getpid(), sizeof( cpu_set_t ), &mask );
if ( error != 0 )
MPI_ERROR( "Error setting process affinity" );
#elif defined( USE_MAC )
// MAC does not support getting or setting the affinity
NULL_USE( procs );
#elif defined( USE_WINDOWS )
DWORD mask = 0;
for ( size_t i = 0; i < procs.size(); i++ )
mask |= ( (DWORD) 1 ) << procs[i];
HANDLE hProc = GetCurrentProcess();
SetProcessAffinityMask( hProc, mask );
#else
#error Unknown OS
#endif
}
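/* Usage sketch (illustrative, not part of the original source): pin the calling process
 * to the first half of the available processors (a no-op on Mac, as noted above):
 *     int N = MPI_CLASS::getNumberOfProcessors();
 *     std::vector<int> cpus;
 *     for ( int i = 0; i < N / 2; i++ )
 *         cpus.push_back( i );
 *     MPI_CLASS::setProcessAffinity( cpus );
 *     auto check = MPI_CLASS::getProcessAffinity();  // should match cpus on Linux/Windows
 */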
/************************************************************************
* Function to check if MPI is active *
************************************************************************/
bool MPI_CLASS::MPI_active()
{
#ifdef USE_MPI
int initialized = 0, finalized = 0;
MPI_Initialized( &initialized );
MPI_Finalized( &finalized );
return initialized != 0 && finalized == 0;
#else
return true;
#endif
}
MPI_CLASS::ThreadSupport MPI_CLASS::queryThreadSupport()
{
#ifdef USE_MPI
int provided = 0;
MPI_Query_thread( &provided );
if ( provided == MPI_THREAD_SINGLE )
return ThreadSupport::SINGLE;
if ( provided == MPI_THREAD_FUNNELED )
return ThreadSupport::FUNNELED;
if ( provided == MPI_THREAD_SERIALIZED )
return ThreadSupport::SERIALIZED;
if ( provided == MPI_THREAD_MULTIPLE )
return ThreadSupport::MULTIPLE;
return ThreadSupport::SINGLE;
#else
return ThreadSupport::MULTIPLE;
#endif
}
/************************************************************************
* Function to perform a load balance of the given processes *
************************************************************************/
void MPI_CLASS::balanceProcesses( const MPI_CLASS &globalComm, const int method,
const std::vector<int> &procs, const int N_min_in, const int N_max_in )
{
// Build the list of processors to use
std::vector<int> cpus = procs;
if ( cpus.empty() ) {
for ( int i = 0; i < getNumberOfProcessors(); i++ )
cpus.push_back( i );
}
// Handle the "easy cases"
if ( method == 1 ) {
// Trivial case where we do not need any communication
setProcessAffinity( cpus );
return;
}
// Get the sub-communicator for the current node
MPI_CLASS nodeComm = globalComm.splitByNode();
int N_min = std::min<int>( std::max<int>( N_min_in, 1 ), cpus.size() );
int N_max = N_max_in;
if ( N_max == -1 )
N_max = cpus.size();
N_max = std::min<int>( N_max, cpus.size() );
MPI_ASSERT( N_max >= N_min );
// Perform the load balance within the node
if ( method == 2 ) {
int N_proc = cpus.size() / nodeComm.getSize();
N_proc = std::max<int>( N_proc, N_min );
N_proc = std::min<int>( N_proc, N_max );
std::vector<int> cpus2( N_proc, -1 );
for ( int i = 0; i < N_proc; i++ )
cpus2[i] = cpus[( nodeComm.getRank() * N_proc + i ) % cpus.size()];
setProcessAffinity( cpus2 );
} else {
MPI_ERROR( "Unknown method for load balance" );
}
}
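/* Usage sketch (illustrative, not part of the original source): method 1 binds every rank
 * to the full processor list without any communication; method 2 divides each node's
 * processors evenly among the ranks sharing that node:
 *     MPI_CLASS world( MPI_COMM_WORLD );
 *     MPI_CLASS::balanceProcesses( world, 2, std::vector<int>(), 1, -1 );
 */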
/************************************************************************
* Empty constructor *
************************************************************************/
MPI_CLASS::MPI_CLASS()
{
// Initialize the data members to a default (null) communicator
#ifdef USE_MPI
communicator = MPI_COMM_NULL;
d_maxTag = 0x7FFFFFFF;
#else
communicator = MPI_CLASS_COMM_NULL;
d_maxTag = mpi_max_tag;
#endif
d_ranks = nullptr;
d_count = nullptr;
d_manage = false;
comm_rank = 0;
comm_size = 1;
d_isNull = true;
d_currentTag = nullptr;
d_call_abort = true;
tmp_alignment = -1;
}
/************************************************************************
* Destructor *
************************************************************************/
MPI_CLASS::~MPI_CLASS() { reset(); }
void MPI_CLASS::reset()
{
// Decrement the count if used
int count = -1;
if ( d_count != nullptr )
count = --( *d_count );
if ( count == 0 ) {
// We are holding the last reference to the MPI_Comm object; we need to free it
if ( d_manage ) {
#ifdef USE_MPI
MPI_Comm_set_errhandler( communicator, MPI_ERRORS_ARE_FATAL );
int err = MPI_Comm_free( &communicator );
if ( err != MPI_SUCCESS )
MPI_ERROR( "Problem free'ing MPI_Comm object" );
communicator = MPI_CLASS_COMM_NULL;
++N_MPI_Comm_destroyed;
#endif
}
if ( d_ranks != nullptr )
delete[] d_ranks;
delete d_count;
}
if ( d_currentTag == nullptr ) {
// No tag index
} else if ( d_currentTag[1] > 1 ) {
--( d_currentTag[1] );
} else {
delete[] d_currentTag;
}
d_manage = false;
d_count = nullptr;
d_ranks = nullptr;
comm_rank = 0;
comm_size = 1;
d_maxTag = 0;
d_isNull = true;
d_currentTag = nullptr;
d_call_abort = true;
}
/************************************************************************
* Copy constructors *
************************************************************************/
MPI_CLASS::MPI_CLASS( const MPI_CLASS &comm )
: communicator( comm.communicator ),
d_isNull( comm.d_isNull ),
d_manage( comm.d_manage ),
comm_rank( comm.comm_rank ),
comm_size( comm.comm_size ),
d_ranks( comm.d_ranks ),
d_maxTag( comm.d_maxTag ),
d_currentTag( comm.d_currentTag )
{
// Initialize the data members to the existing comm object
if ( d_currentTag != nullptr )
++d_currentTag[1];
d_call_abort = comm.d_call_abort;
// Set and increment the count
d_count = comm.d_count;
if ( d_count != nullptr )
++( *d_count );
tmp_alignment = -1;
}
MPI_CLASS::MPI_CLASS( MPI_CLASS &&rhs ) : MPI_CLASS()
{
std::swap( communicator, rhs.communicator );
std::swap( d_isNull, rhs.d_isNull );
std::swap( d_manage, rhs.d_manage );
std::swap( d_call_abort, rhs.d_call_abort );
std::swap( profile_level, rhs.profile_level );
std::swap( comm_rank, rhs.comm_rank );
std::swap( comm_size, rhs.comm_size );
std::swap( d_ranks, rhs.d_ranks );
std::swap( d_maxTag, rhs.d_maxTag );
std::swap( d_currentTag, rhs.d_currentTag );
std::swap( d_count, rhs.d_count );
std::swap( tmp_alignment, rhs.tmp_alignment );
}
/************************************************************************
* Assignment operators *
************************************************************************/
MPI_CLASS &MPI_CLASS::operator=( const MPI_CLASS &comm )
{
if ( this == &comm ) // protect against invalid self-assignment
return *this;
// Destroy the previous object
this->reset();
// Initialize the data members to the existing object
this->communicator = comm.communicator;
this->comm_rank = comm.comm_rank;
this->comm_size = comm.comm_size;
this->d_ranks = comm.d_ranks;
this->d_isNull = comm.d_isNull;
this->d_manage = comm.d_manage;
this->d_maxTag = comm.d_maxTag;
this->d_call_abort = comm.d_call_abort;
this->d_currentTag = comm.d_currentTag;
if ( this->d_currentTag != nullptr )
++( this->d_currentTag[1] );
// Set and increment the count
this->d_count = comm.d_count;
if ( this->d_count != nullptr )
++( *d_count );
this->tmp_alignment = -1;
return *this;
}
MPI_CLASS &MPI_CLASS::operator=( MPI_CLASS &&rhs )
{
if ( this == &rhs ) // protect against invalid self-assignment
return *this;
std::swap( communicator, rhs.communicator );
std::swap( d_isNull, rhs.d_isNull );
std::swap( d_manage, rhs.d_manage );
std::swap( d_call_abort, rhs.d_call_abort );
std::swap( profile_level, rhs.profile_level );
std::swap( comm_rank, rhs.comm_rank );
std::swap( comm_size, rhs.comm_size );
std::swap( d_ranks, rhs.d_ranks );
std::swap( d_maxTag, rhs.d_maxTag );
std::swap( d_currentTag, rhs.d_currentTag );
std::swap( d_count, rhs.d_count );
std::swap( tmp_alignment, rhs.tmp_alignment );
return *this;
}
/************************************************************************
* Constructor from existing MPI communicator *
************************************************************************/
int d_global_currentTag_world1[2] = { 1, 1 };
int d_global_currentTag_world2[2] = { 1, 1 };
int d_global_currentTag_self[2] = { 1, 1 };
#ifdef USE_MPI
std::atomic_int d_global_count_world1 = { 1 };
std::atomic_int d_global_count_world2 = { 1 };
std::atomic_int d_global_count_self = { 1 };
#endif
MPI_CLASS::MPI_CLASS( MPI_Comm comm, bool manage )
{
d_count = nullptr;
d_ranks = nullptr;
d_manage = false;
tmp_alignment = -1;
// Check if we are using our version of comm_world
if ( comm == MPI_CLASS_COMM_WORLD ) {
communicator = MPI_COMM_WORLD;
} else if ( comm == MPI_CLASS_COMM_SELF ) {
communicator = MPI_COMM_SELF;
} else if ( comm == MPI_CLASS_COMM_NULL ) {
communicator = MPI_COMM_NULL;
} else {
communicator = comm;
}
#ifdef USE_MPI
// We are using MPI, use the MPI communicator to initialize the data
if ( communicator != MPI_COMM_NULL ) {
// Set the MPI_SIZE_T datatype if it has not been set
if ( MPI_SIZE_T == 0x0 )
MPI_SIZE_T = getSizeTDataType();
// Attach the error handler
StackTrace::setMPIErrorHandler( communicator );
// Get the communicator properties
MPI_Comm_rank( communicator, &comm_rank );
MPI_Comm_size( communicator, &comm_size );
int flag, *val;
int ierr = MPI_Comm_get_attr( communicator, MPI_TAG_UB, &val, &flag );
MPI_ASSERT( ierr == MPI_SUCCESS );
if ( flag == 0 ) {
d_maxTag = 0x7FFFFFFF; // The tag is not a valid attribute (set to 2^31-1)
} else {
d_maxTag = *val;
if ( d_maxTag < 0 )
d_maxTag = 0x7FFFFFFF; // The maximum tag exceeds a signed int (set to 2^31-1)
MPI_INSIST( d_maxTag >= 0x7FFF, "maximum tag size is < MPI standard" );
}
} else {
comm_rank = 1;
comm_size = 0;
d_maxTag = 0x7FFFFFFF;
}
d_isNull = communicator == MPI_COMM_NULL;
if ( manage && communicator != MPI_COMM_NULL && communicator != MPI_COMM_SELF &&
communicator != MPI_COMM_WORLD )
d_manage = true;
// Create the count (Note: we do not need to worry about thread safety)
if ( communicator == MPI_CLASS_COMM_WORLD ) {
d_count = &d_global_count_world1;
++( *d_count );
} else if ( communicator == MPI_COMM_WORLD ) {
d_count = &d_global_count_world2;
++( *d_count );
} else if ( communicator == MPI_COMM_SELF ) {
d_count = &d_global_count_self;
++( *d_count );
} else if ( communicator == MPI_COMM_NULL ) {
d_count = nullptr;
} else {
d_count = new std::atomic_int;
*d_count = 1;
}
if ( d_manage )
++N_MPI_Comm_created;
// Create d_ranks
if ( comm_size > 1 ) {
d_ranks = new int[comm_size];
d_ranks[0] = -1;
}
#else
// We are not using MPI, initialize based on the communicator
NULL_USE( manage );
comm_rank = 0;
comm_size = 1;
d_maxTag = mpi_max_tag;
d_isNull = communicator == MPI_COMM_NULL;
if ( d_isNull )
comm_size = 0;
#endif
if ( communicator == MPI_CLASS_COMM_WORLD ) {
d_currentTag = d_global_currentTag_world1;
++( this->d_currentTag[1] );
} else if ( communicator == MPI_COMM_WORLD ) {
d_currentTag = d_global_currentTag_world2;
++( this->d_currentTag[1] );
} else if ( communicator == MPI_COMM_SELF ) {
d_currentTag = d_global_currentTag_self;
++( this->d_currentTag[1] );
} else if ( communicator == MPI_COMM_NULL ) {
d_currentTag = nullptr;
} else {
d_currentTag = new int[2];
d_currentTag[0] = ( d_maxTag <= 0x10000 ) ? 1 : 0x1FFF;
d_currentTag[1] = 1;
}
d_call_abort = true;
}
/************************************************************************
* Return the ranks of the communicator in the global comm *
************************************************************************/
std::vector<int> MPI_CLASS::globalRanks() const
{
// Get my global rank if it has not been set
static int myGlobalRank = -1;
if ( myGlobalRank == -1 ) {
#ifdef USE_MPI
if ( MPI_active() )
MPI_Comm_rank( MPI_CLASS_COMM_WORLD, &myGlobalRank );
#else
myGlobalRank = 0;
#endif
}
// Check if we are dealing with a serial or null communicator
if ( comm_size == 1 )
return std::vector<int>( 1, myGlobalRank );
if ( d_ranks == nullptr || communicator == MPI_COMM_NULL )
return std::vector<int>();
// Fill d_ranks if necessary
if ( d_ranks[0] == -1 ) {
if ( communicator == MPI_CLASS_COMM_WORLD ) {
for ( int i = 0; i < comm_size; i++ )
d_ranks[i] = i;
} else {
MPI_ASSERT( myGlobalRank != -1 );
this->allGather( myGlobalRank, d_ranks );
}
}
// Return d_ranks
return std::vector<int>( d_ranks, d_ranks + comm_size );
}
/************************************************************************
* Generate a random number *
************************************************************************/
size_t MPI_CLASS::rand() const
{
size_t val = 0;
if ( getRank() == 0 ) {
static std::random_device rd;
static std::mt19937 gen( rd() );
static std::uniform_int_distribution<size_t> dist;
val = dist( gen );
}
val = bcast( val, 0 );
return val;
}
/************************************************************************
* Intersect two communicators *
************************************************************************/
#ifdef USE_MPI
static inline void MPI_Group_free2( MPI_Group *group )
{
if ( *group != MPI_GROUP_EMPTY ) {
// MPICH is fine with freeing an empty group; OpenMPI crashes
MPI_Group_free( group );
}
}
MPI_CLASS MPI_CLASS::intersect( const MPI_CLASS &comm1, const MPI_CLASS &comm2 )
{
MPI_Group group1 = MPI_GROUP_EMPTY, group2 = MPI_GROUP_EMPTY;
if ( !comm1.isNull() ) {
MPI_Group_free2( &group1 );
MPI_Comm_group( comm1.communicator, &group1 );
}
if ( !comm2.isNull() ) {
MPI_Group_free2( &group2 );
MPI_Comm_group( comm2.communicator, &group2 );
}
MPI_Group group12;
MPI_Group_intersection( group1, group2, &group12 );
int compare1, compare2;
MPI_Group_compare( group1, group12, &compare1 );
MPI_Group_compare( group2, group12, &compare2 );
MPI_CLASS new_comm( MPI_CLASS_COMM_NULL );
int size;
MPI_Group_size( group12, &size );
if ( compare1 != MPI_UNEQUAL && size != 0 ) {
// The intersection matches comm1
new_comm = comm1;
} else if ( compare2 != MPI_UNEQUAL && size != 0 ) {
// The intersection matches comm2
new_comm = comm2;
} else if ( comm1.isNull() ) {
// comm1 is null, we can return safely (comm1 is needed for communication)
} else {
// The intersection is smaller than comm1 or comm2
// Check if the new comm is null for all processors
int max_size = 0;
MPI_Allreduce( &size, &max_size, 1, MPI_INT, MPI_MAX, comm1.communicator );
if ( max_size == 0 ) {
// We are dealing with completely disjoint sets
new_comm = MPI_CLASS( MPI_CLASS_COMM_NULL, false );
} else {
// Create the new comm
// Note: OpenMPI crashes if the intersection group is EMPTY for any processors
// We will set it to SELF for the EMPTY processors, then create a null comm later
if ( group12 == MPI_GROUP_EMPTY ) {
MPI_Group_free2( &group12 );
MPI_Comm_group( MPI_COMM_SELF, &group12 );
}
MPI_Comm new_MPI_comm;
MPI_Comm_create( comm1.communicator, group12, &new_MPI_comm );
if ( size > 0 ) {
// This is the valid case where we create a new intersection comm
new_comm = MPI_CLASS( new_MPI_comm, true );
} else {
// We actually want a null comm for this communicator
new_comm = MPI_CLASS( MPI_CLASS_COMM_NULL, false );
MPI_Comm_free( &new_MPI_comm );
}
}
}
MPI_Group_free2( &group1 );
MPI_Group_free2( &group2 );
MPI_Group_free2( &group12 );
return new_comm;
}
#else
MPI_CLASS MPI_CLASS::intersect( const MPI_CLASS &comm1, const MPI_CLASS &comm2 )
{
if ( comm1.isNull() || comm2.isNull() )
return MPI_CLASS( MPI_CLASS_COMM_NULL, false );
MPI_ASSERT( comm1.comm_size == 1 && comm2.comm_size == 1 );
return comm1;
}
#endif
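/* Usage sketch (illustrative, not part of the original source): the intersection contains
 * the ranks present in both communicators; completely disjoint communicators yield a
 * null comm on every rank:
 *     MPI_CLASS world( MPI_COMM_WORLD );
 *     MPI_CLASS sub = world.split( world.getRank() % 2, 0 );   // even/odd sub-communicators
 *     MPI_CLASS common = MPI_CLASS::intersect( world, sub );   // matches sub on every rank
 */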
/************************************************************************
* Split a comm *
************************************************************************/
MPI_CLASS MPI_CLASS::split( int color, int key ) const
{
if ( d_isNull ) {
return MPI_CLASS( MPI_CLASS_COMM_NULL );
} else if ( comm_size == 1 ) {
if ( color == -1 )
return MPI_CLASS( MPI_CLASS_COMM_NULL );
return dup();
}
MPI_Comm new_MPI_comm = MPI_CLASS_COMM_NULL;
#ifdef USE_MPI
// USE MPI to split the communicator
if ( color == -1 ) {
check_MPI( MPI_Comm_split( communicator, MPI_UNDEFINED, key, &new_MPI_comm ) );
} else {
check_MPI( MPI_Comm_split( communicator, color, key, &new_MPI_comm ) );
}
#endif
// Create the new object
NULL_USE( key );
MPI_CLASS new_comm( new_MPI_comm, true );
new_comm.d_call_abort = d_call_abort;
return new_comm;
}
MPI_CLASS MPI_CLASS::splitByNode( int key ) const
{
// Check if we are dealing with a single processor (trivial case)
if ( comm_size == 1 )
return this->split( 0, 0 );
// Get the node name
std::string name = MPI_CLASS::getNodeName();
// Gather the names from all ranks
std::vector<std::string> list( comm_size );
allGather( name, &list[0] );
// Create the colors (ranks that share a node name share a color)
std::vector<int> color( comm_size, -1 );
int N_colors = 0;
for ( int i = 0; i < comm_size; i++ ) {
for ( int j = 0; j < i; j++ ) {
if ( list[i] == list[j] ) {
color[i] = color[j];
break;
}
}
if ( color[i] == -1 )
color[i] = N_colors++;
}
MPI_CLASS new_comm = this->split( color[comm_rank], key );
return new_comm;
}
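/* Usage sketch (illustrative, not part of the original source):
 *     MPI_CLASS world( MPI_COMM_WORLD );
 *     MPI_CLASS node = world.splitByNode( 0 );   // ranks that share a node name
 *     MPI_CLASS half = world.split( world.getRank() < world.getSize() / 2 ? 0 : 1, 0 );
 *     MPI_CLASS none = world.split( -1, 0 );     // color == -1 yields a null communicator
 */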
/************************************************************************
* Duplicate an existing comm object *
************************************************************************/
MPI_CLASS MPI_CLASS::dup() const
{
if ( d_isNull )
return MPI_CLASS( MPI_CLASS_COMM_NULL );
MPI_Comm new_MPI_comm = communicator;
#if defined( USE_MPI ) || defined( USE_PETSC )
// USE MPI to duplicate the communicator
MPI_Comm_dup( communicator, &new_MPI_comm );
#else
new_MPI_comm = uniqueGlobalComm;
uniqueGlobalComm++;
#endif
// Create the new comm object
MPI_CLASS new_comm( new_MPI_comm, true );
new_comm.d_isNull = d_isNull;
new_comm.d_call_abort = d_call_abort;
return new_comm;
}
/************************************************************************
* Get the node name *
************************************************************************/
std::string MPI_CLASS::getNodeName()
{
#ifdef USE_MPI
int length;
char name[MPI_MAX_PROCESSOR_NAME + 1];
memset( name, 0, MPI_MAX_PROCESSOR_NAME + 1 );
MPI_Get_processor_name( name, &length );
return std::string( name );
#else
return "Node0";
#endif
}
/************************************************************************
* Overload operator == *
************************************************************************/
bool MPI_CLASS::operator==( const MPI_CLASS &comm ) const
{
return communicator == comm.communicator;
}
/************************************************************************
* Overload operator != *
************************************************************************/
bool MPI_CLASS::operator!=( const MPI_CLASS &comm ) const
{
return communicator != comm.communicator;
}
/************************************************************************
* Overload operator < *
************************************************************************/
bool MPI_CLASS::operator<( const MPI_CLASS &comm ) const
{
MPI_ASSERT( !this->d_isNull && !comm.d_isNull );
bool flag = true;
// First check if either communicator is NULL
if ( this->d_isNull )
return false;
if ( comm.d_isNull )
flag = false;
// Use compare to check if the comms are equal
if ( compare( comm ) != 0 )
return false;
// Check that the size of the other communicator is > the current communicator size
if ( comm_size >= comm.comm_size )
flag = false;
// Check the union of the communicator groups
// this is < comm iff this group is a subgroup of comm's group
#ifdef USE_MPI
MPI_Group group1 = MPI_GROUP_EMPTY, group2 = MPI_GROUP_EMPTY, group12 = MPI_GROUP_EMPTY;
if ( !d_isNull )
MPI_Comm_group( communicator, &group1 );
if ( !comm.d_isNull )
MPI_Comm_group( comm.communicator, &group2 );
MPI_Group_union( group1, group2, &group12 );
int compare;
MPI_Group_compare( group2, group12, &compare );
if ( compare == MPI_UNEQUAL )
flag = false;
MPI_Group_free( &group1 );
MPI_Group_free( &group2 );
MPI_Group_free( &group12 );
#endif
// Perform a global reduce of the flag (equivalent to all operation)
return allReduce( flag );
}
/************************************************************************
* Overload operator <= *
************************************************************************/
bool MPI_CLASS::operator<=( const MPI_CLASS &comm ) const
{
MPI_ASSERT( !this->d_isNull && !comm.d_isNull );
bool flag = true;
// First check if either communicator is NULL
if ( this->d_isNull )
return false;
if ( comm.d_isNull )
flag = false;
#ifdef USE_MPI
int world_size = 0;
MPI_Comm_size( MPI_COMM_WORLD, &world_size );
if ( comm.getSize() == world_size )
return true;
if ( getSize() == 1 && !comm.d_isNull )
return true;
#endif
// Use compare to check if the comms are equal
if ( compare( comm ) != 0 )
return true;
// Check that the size of the other communicator is > the current communicator size
// this is <= comm iff this group is a subgroup of comm's group
if ( comm_size > comm.comm_size )
flag = false;
// Check the union of the communicator groups
#ifdef USE_MPI
MPI_Group group1, group2, group12;
MPI_Comm_group( communicator, &group1 );
MPI_Comm_group( comm.communicator, &group2 );
MPI_Group_union( group1, group2, &group12 );
int compare;
MPI_Group_compare( group2, group12, &compare );
if ( compare == MPI_UNEQUAL )
flag = false;
MPI_Group_free( &group1 );
MPI_Group_free( &group2 );
MPI_Group_free( &group12 );
#endif
// Perform a global reduce of the flag (equivalent to all operation)
return allReduce( flag );
}
/************************************************************************
* Overload operator > *
************************************************************************/
bool MPI_CLASS::operator>( const MPI_CLASS &comm ) const
{
bool flag = true;
// First check if either communicator is NULL
if ( this->d_isNull )
return false;
if ( comm.d_isNull )
flag = false;
// Use compare to check if the comms are equal
if ( compare( comm ) != 0 )
return false;
// Check that the size of the other communicator is > the current communicator size
if ( comm_size <= comm.comm_size )
flag = false;
// Check the union of the communicator groups
// this is > comm iff comm's group is a subgroup of this group
#ifdef USE_MPI
MPI_Group group1 = MPI_GROUP_EMPTY, group2 = MPI_GROUP_EMPTY, group12 = MPI_GROUP_EMPTY;
if ( !d_isNull )
MPI_Comm_group( communicator, &group1 );
if ( !comm.d_isNull )
MPI_Comm_group( comm.communicator, &group2 );
MPI_Group_union( group1, group2, &group12 );
int compare;
MPI_Group_compare( group1, group12, &compare );
if ( compare == MPI_UNEQUAL )
flag = false;
MPI_Group_free( &group1 );
MPI_Group_free( &group2 );
MPI_Group_free( &group12 );
#endif
// Perform a global reduce of the flag (equivalent to all operation)
return allReduce( flag );
}
/************************************************************************
* Overload operator >= *
************************************************************************/
bool MPI_CLASS::operator>=( const MPI_CLASS &comm ) const
{
bool flag = true;
// First check if either communicator is NULL
if ( this->d_isNull )
return false;
if ( comm.d_isNull )
flag = false;
#ifdef USE_MPI
int world_size = 0;
MPI_Comm_size( MPI_COMM_WORLD, &world_size );
if ( getSize() == world_size )
return true;
if ( comm.getSize() == 1 && !comm.d_isNull )
return true;
#endif
// Use compare to check if the comms are equal
if ( compare( comm ) != 0 )
return true;
// Check that the size of the other communicator is > the current communicator size
if ( comm_size < comm.comm_size )
flag = false;
// Check the union of the communicator groups
// this is >= comm iff comm's group is a subgroup of this group
#ifdef USE_MPI
MPI_Group group1 = MPI_GROUP_EMPTY, group2 = MPI_GROUP_EMPTY, group12 = MPI_GROUP_EMPTY;
if ( !d_isNull )
MPI_Comm_group( communicator, &group1 );
if ( !comm.d_isNull )
MPI_Comm_group( comm.communicator, &group2 );
MPI_Group_union( group1, group2, &group12 );
int compare;
MPI_Group_compare( group1, group12, &compare );
if ( compare == MPI_UNEQUAL )
flag = false;
MPI_Group_free( &group1 );
MPI_Group_free( &group2 );
MPI_Group_free( &group12 );
#endif
// Perform a global reduce of the flag (equivalent to all operation)
return allReduce( flag );
}
/************************************************************************
* Compare two comm objects *
************************************************************************/
int MPI_CLASS::compare( const MPI_CLASS &comm ) const
{
if ( communicator == comm.communicator )
return 1;
#ifdef USE_MPI
if ( d_isNull || comm.d_isNull )
return 0;
int result;
check_MPI( MPI_Comm_compare( communicator, comm.communicator, &result ) );
if ( result == MPI_IDENT )
return 2;
else if ( result == MPI_CONGRUENT )
return 3;
else if ( result == MPI_SIMILAR )
return 4;
else if ( result == MPI_UNEQUAL )
return 0;
MPI_ERROR( "Unknown results from comm compare" );
#else
if ( comm.communicator == MPI_COMM_NULL || communicator == MPI_COMM_NULL )
return 0;
else
return 3;
#endif
return 0;
}
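/* Usage sketch (illustrative, not part of the original source): compare() returns 1 if the
 * two objects wrap the same MPI_Comm handle, 2 for MPI_IDENT, 3 for MPI_CONGRUENT (same
 * ranks in the same order, different context), 4 for MPI_SIMILAR (same ranks, different
 * order), and 0 if the communicators are unequal or either is null:
 *     MPI_CLASS a( MPI_COMM_WORLD );
 *     MPI_CLASS b = a.dup();
 *     int c1 = a.compare( a );   // 1: same handle
 *     int c2 = a.compare( b );   // 3: congruent duplicate
 */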
/************************************************************************
* Abort the program. *
************************************************************************/
void MPI_CLASS::setCallAbortInSerialInsteadOfExit( bool flag ) { d_call_abort = flag; }
void MPI_CLASS::abort() const
{
#ifdef USE_MPI
MPI_Comm comm = communicator;
if ( comm == MPI_COMM_NULL )
comm = MPI_COMM_WORLD;
if ( !MPI_active() ) {
// MPI is not available
exit( -1 );
} else if ( comm_size > 1 ) {
MPI_Abort( comm, -1 );
} else if ( d_call_abort ) {
MPI_Abort( comm, -1 );
} else {
exit( -1 );
}
#else
exit( -1 );
#endif
}
/************************************************************************
* newTag *
************************************************************************/
int MPI_CLASS::newTag()
{
#ifdef USE_MPI
// Synchronize the processes to ensure all ranks enter this call
// Needed so the count will match
barrier();
// Return and increment the tag
int tag = ( *d_currentTag )++;
MPI_INSIST( tag <= d_maxTag, "Maximum number of tags exceeded\n" );
return tag;
#else
static int globalCurrentTag = 1;
return globalCurrentTag++;
#endif
}
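/* Usage sketch (illustrative, not part of the original source): newTag() is collective, so
 * every rank receives the same fresh tag and can use it for matched point-to-point traffic
 * without colliding with other tags on the communicator:
 *     MPI_CLASS comm( MPI_COMM_WORLD );
 *     int tag = comm.newTag();   // same value on every rank of comm
 */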
/************************************************************************
* allReduce *
************************************************************************/
bool MPI_CLASS::allReduce( const bool value ) const
{
bool ret = value;
if ( comm_size > 1 ) {
#ifdef USE_MPI
MPI_Allreduce(
(void *) &value, (void *) &ret, 1, MPI_UNSIGNED_CHAR, MPI_MIN, communicator );
#else
MPI_ERROR( "This shouldn't be possible" );
#endif
}
return ret;
}
/************************************************************************
* anyReduce *
************************************************************************/
bool MPI_CLASS::anyReduce( const bool value ) const
{
bool ret = value;
if ( comm_size > 1 ) {
#ifdef USE_MPI
MPI_Allreduce(
(void *) &value, (void *) &ret, 1, MPI_UNSIGNED_CHAR, MPI_MAX, communicator );
#else
MPI_ERROR( "This shouldn't be possible" );
#endif
}
return ret;
}
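/* Usage sketch (illustrative, not part of the original source): the boolean reductions
 * implement "true on all ranks" (MPI_MIN) and "true on any rank" (MPI_MAX):
 *     MPI_CLASS comm( MPI_COMM_WORLD );
 *     bool ok = localCheckPassed();        // hypothetical per-rank flag
 *     bool every = comm.allReduce( ok );   // true only if ok is true on every rank
 *     bool some = comm.anyReduce( ok );    // true if ok is true on at least one rank
 */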
/************************************************************************
* call_sumReduce *
* Note: these specializations are only called when using MPI. *
************************************************************************/
#ifdef USE_MPI
// unsigned char
template<>
void MPI_CLASS::call_sumReduce<unsigned char>(
const unsigned char *send, unsigned char *recv, const int n ) const
{
PROFILE_START( "sumReduce1<unsigned char>", profile_level );
MPI_Allreduce( (void *) send, (void *) recv, n, MPI_UNSIGNED_CHAR, MPI_SUM, communicator );
PROFILE_STOP( "sumReduce1<unsigned char>", profile_level );
}
template<>
void MPI_CLASS::call_sumReduce<unsigned char>( unsigned char *x, const int n ) const
{
PROFILE_START( "sumReduce2<unsigned char>", profile_level );
auto send = x;
auto recv = new unsigned char[n];
MPI_Allreduce( send, recv, n, MPI_UNSIGNED_CHAR, MPI_SUM, communicator );
for ( int i = 0; i < n; i++ )
x[i] = recv[i];
delete[] recv;
PROFILE_STOP( "sumReduce2<unsigned char>", profile_level );
}
// char
template<>
void MPI_CLASS::call_sumReduce<char>( const char *send, char *recv, const int n ) const
{
PROFILE_START( "sumReduce1<char>", profile_level );
MPI_Allreduce( (void *) send, (void *) recv, n, MPI_SIGNED_CHAR, MPI_SUM, communicator );
PROFILE_STOP( "sumReduce1<char>", profile_level );
}
template<>
void MPI_CLASS::call_sumReduce<char>( char *x, const int n ) const
{
PROFILE_START( "sumReduce2<char>", profile_level );
auto send = x;
auto recv = new char[n];
MPI_Allreduce( send, recv, n, MPI_SIGNED_CHAR, MPI_SUM, communicator );
for ( int i = 0; i < n; i++ )
x[i] = recv[i];
delete[] recv;
PROFILE_STOP( "sumReduce2<char>", profile_level );
}
// unsigned int
template<>
void MPI_CLASS::call_sumReduce<unsigned int>(
const unsigned int *send, unsigned int *recv, const int n ) const
{
PROFILE_START( "sumReduce1<unsigned int>", profile_level );
MPI_Allreduce( (void *) send, (void *) recv, n, MPI_UNSIGNED, MPI_SUM, communicator );
PROFILE_STOP( "sumReduce1<unsigned int>", profile_level );
}
template<>
void MPI_CLASS::call_sumReduce<unsigned int>( unsigned int *x, const int n ) const
{
PROFILE_START( "sumReduce2<unsigned int>", profile_level );
auto send = x;
auto recv = new unsigned int[n];
MPI_Allreduce( send, recv, n, MPI_UNSIGNED, MPI_SUM, communicator );
for ( int i = 0; i < n; i++ )
x[i] = recv[i];
delete[] recv;
PROFILE_STOP( "sumReduce2<unsigned int>", profile_level );
}
// int
template<>
void MPI_CLASS::call_sumReduce<int>( const int *send, int *recv, const int n ) const
{
PROFILE_START( "sumReduce1<int>", profile_level );
MPI_Allreduce( (void *) send, (void *) recv, n, MPI_INT, MPI_SUM, communicator );
PROFILE_STOP( "sumReduce1<int>", profile_level );
}
template<>
void MPI_CLASS::call_sumReduce<int>( int *x, const int n ) const
{
PROFILE_START( "sumReduce2<int>", profile_level );
auto send = x;
auto recv = new int[n];
MPI_Allreduce( send, recv, n, MPI_INT, MPI_SUM, communicator );
for ( int i = 0; i < n; i++ )
x[i] = recv[i];
delete[] recv;
PROFILE_STOP( "sumReduce2<int>", profile_level );
}
// long int
template<>
void MPI_CLASS::call_sumReduce<long int>( const long int *send, long int *recv, const int n ) const
{
PROFILE_START( "sumReduce1<long int>", profile_level );
MPI_Allreduce( (void *) send, (void *) recv, n, MPI_LONG, MPI_SUM, communicator );
PROFILE_STOP( "sumReduce1<long int>", profile_level );
}
template<>
void MPI_CLASS::call_sumReduce<long int>( long int *x, const int n ) const
{
PROFILE_START( "sumReduce2<long int>", profile_level );
auto send = x;
auto recv = new long int[n];
MPI_Allreduce( send, recv, n, MPI_LONG, MPI_SUM, communicator );
for ( int i = 0; i < n; i++ )
x[i] = recv[i];
delete[] recv;
PROFILE_STOP( "sumReduce2<long int>", profile_level );
}
// unsigned long int
template<>
void MPI_CLASS::call_sumReduce<unsigned long>(
const unsigned long *send, unsigned long *recv, const int n ) const
{
PROFILE_START( "sumReduce1<unsigned long>", profile_level );
MPI_Allreduce( (void *) send, (void *) recv, n, MPI_UNSIGNED_LONG, MPI_SUM, communicator );
PROFILE_STOP( "sumReduce1<unsigned long>", profile_level );
}
template<>
void MPI_CLASS::call_sumReduce<unsigned long>( unsigned long *x, const int n ) const
{
PROFILE_START( "sumReduce2<unsigned long>", profile_level );
auto send = x;
auto recv = new unsigned long int[n];
MPI_Allreduce( send, recv, n, MPI_UNSIGNED_LONG, MPI_SUM, communicator );
for ( int i = 0; i < n; i++ )
x[i] = recv[i];
delete[] recv;
PROFILE_STOP( "sumReduce2<unsigned long>", profile_level );
}
// size_t
#ifdef USE_WINDOWS
template<>
void MPI_CLASS::call_sumReduce<size_t>( const size_t *send, size_t *recv, const int n ) const
{
MPI_ASSERT( MPI_SIZE_T != 0 );
PROFILE_START( "sumReduce1<size_t>", profile_level );
MPI_Allreduce( (void *) send, (void *) recv, n, MPI_SIZE_T, MPI_SUM, communicator );
PROFILE_STOP( "sumReduce1<size_t>", profile_level );
}
template<>
void MPI_CLASS::call_sumReduce<size_t>( size_t *x, const int n ) const
{
MPI_ASSERT( MPI_SIZE_T != 0 );
PROFILE_START( "sumReduce2<size_t>", profile_level );
auto send = x;
auto recv = new size_t[n];
MPI_Allreduce( (void *) send, (void *) recv, n, MPI_SIZE_T, MPI_SUM, communicator );
for ( int i = 0; i < n; i++ )
x[i] = recv[i];
delete[] recv;
PROFILE_STOP( "sumReduce2<size_t>", profile_level );
}
#endif
// float
template<>
void MPI_CLASS::call_sumReduce<float>( const float *send, float *recv, const int n ) const
{
PROFILE_START( "sumReduce1<float>", profile_level );
MPI_Allreduce( (void *) send, (void *) recv, n, MPI_FLOAT, MPI_SUM, communicator );
PROFILE_STOP( "sumReduce1<float>", profile_level );
}
template<>
void MPI_CLASS::call_sumReduce<float>( float *x, const int n ) const
{
PROFILE_START( "sumReduce2<float>", profile_level );
auto send = x;
auto recv = new float[n];
MPI_Allreduce( send, recv, n, MPI_FLOAT, MPI_SUM, communicator );
for ( int i = 0; i < n; i++ )
x[i] = recv[i];
delete[] recv;
PROFILE_STOP( "sumReduce2<float>", profile_level );
}
// double
template<>
void MPI_CLASS::call_sumReduce<double>( const double *send, double *recv, const int n ) const
{
PROFILE_START( "sumReduce1<double>", profile_level );
MPI_Allreduce( (void *) send, (void *) recv, n, MPI_DOUBLE, MPI_SUM, communicator );
PROFILE_STOP( "sumReduce1<double>", profile_level );
}
template<>
void MPI_CLASS::call_sumReduce<double>( double *x, const int n ) const
{
PROFILE_START( "sumReduce2<double>", profile_level );
auto send = x;
auto recv = new double[n];
MPI_Allreduce( send, recv, n, MPI_DOUBLE, MPI_SUM, communicator );
for ( int i = 0; i < n; i++ )
x[i] = recv[i];
delete[] recv;
PROFILE_STOP( "sumReduce2<double>", profile_level );
}
// std::complex<double>
template<>
void MPI_CLASS::call_sumReduce<std::complex<double>>(
const std::complex<double> *x, std::complex<double> *y, const int n ) const
{
PROFILE_START( "sumReduce1<complex double>", profile_level );
auto send = new double[2 * n];
auto recv = new double[2 * n];
for ( int i = 0; i < n; i++ ) {
send[2 * i + 0] = real( x[i] );
send[2 * i + 1] = imag( x[i] );
}
MPI_Allreduce( (void *) send, (void *) recv, 2 * n, MPI_DOUBLE, MPI_SUM, communicator );
for ( int i = 0; i < n; i++ )
y[i] = std::complex<double>( recv[2 * i + 0], recv[2 * i + 1] );
delete[] send;
delete[] recv;
PROFILE_STOP( "sumReduce1<complex double>", profile_level );
}
template<>
void MPI_CLASS::call_sumReduce<std::complex<double>>( std::complex<double> *x, const int n ) const
{
PROFILE_START( "sumReduce2<complex double>", profile_level );
auto send = new double[2 * n];
auto recv = new double[2 * n];
for ( int i = 0; i < n; i++ ) {
send[2 * i + 0] = real( x[i] );
send[2 * i + 1] = imag( x[i] );
}
MPI_Allreduce( send, recv, 2 * n, MPI_DOUBLE, MPI_SUM, communicator );
for ( int i = 0; i < n; i++ )
x[i] = std::complex<double>( recv[2 * i + 0], recv[2 * i + 1] );
delete[] send;
delete[] recv;
PROFILE_STOP( "sumReduce2<complex double>", profile_level );
}
#endif
/************************************************************************
* call_minReduce *
* Note: these specializations are only called when using MPI. *
************************************************************************/
#ifdef USE_MPI
// unsigned char
template<>
void MPI_CLASS::call_minReduce<unsigned char>(
const unsigned char *send, unsigned char *recv, const int n, int *comm_rank_of_min ) const
{
if ( comm_rank_of_min == nullptr ) {
PROFILE_START( "minReduce1<unsigned char>", profile_level );
MPI_Allreduce( (void *) send, (void *) recv, n, MPI_UNSIGNED_CHAR, MPI_MIN, communicator );
PROFILE_STOP( "minReduce1<unsigned char>", profile_level );
} else {
auto tmp = new int[n];
for ( int i = 0; i < n; i++ )
tmp[i] = send[i];
call_minReduce<int>( tmp, n, comm_rank_of_min );
for ( int i = 0; i < n; i++ )
recv[i] = static_cast<unsigned char>( tmp[i] );
delete[] tmp;
}
}
template<>
void MPI_CLASS::call_minReduce<unsigned char>(
unsigned char *x, const int n, int *comm_rank_of_min ) const
{
if ( comm_rank_of_min == nullptr ) {
PROFILE_START( "minReduce2<unsigned char>", profile_level );
auto send = x;
auto recv = new unsigned char[n];
MPI_Allreduce( send, recv, n, MPI_UNSIGNED_CHAR, MPI_MIN, communicator );
for ( int i = 0; i < n; i++ )
x[i] = recv[i];
delete[] recv;
PROFILE_STOP( "minReduce2<unsigned char>", profile_level );
} else {
auto tmp = new int[n];
for ( int i = 0; i < n; i++ )
tmp[i] = x[i];
call_minReduce<int>( tmp, n, comm_rank_of_min );
for ( int i = 0; i < n; i++ )
x[i] = static_cast<unsigned char>( tmp[i] );
delete[] tmp;
}
}
// char
template<>
void MPI_CLASS::call_minReduce<char>(
const char *send, char *recv, const int n, int *comm_rank_of_min ) const
{
if ( comm_rank_of_min == nullptr ) {
PROFILE_START( "minReduce1<char>", profile_level );
MPI_Allreduce( (void *) send, (void *) recv, n, MPI_SIGNED_CHAR, MPI_MIN, communicator );
PROFILE_STOP( "minReduce1<char>", profile_level );
} else {
auto tmp = new int[n];
for ( int i = 0; i < n; i++ )
tmp[i] = send[i];
call_minReduce<int>( tmp, n, comm_rank_of_min );
for ( int i = 0; i < n; i++ )
recv[i] = static_cast<char>( tmp[i] );
delete[] tmp;
}
}
template<>
void MPI_CLASS::call_minReduce<char>( char *x, const int n, int *comm_rank_of_min ) const
{
if ( comm_rank_of_min == nullptr ) {
PROFILE_START( "minReduce2<char>", profile_level );
auto send = x;
auto recv = new char[n];
MPI_Allreduce( send, recv, n, MPI_SIGNED_CHAR, MPI_MIN, communicator );
for ( int i = 0; i < n; i++ )
x[i] = recv[i];
delete[] recv;
PROFILE_STOP( "minReduce2<char>", profile_level );
} else {
auto tmp = new int[n];
for ( int i = 0; i < n; i++ )
tmp[i] = x[i];
call_minReduce<int>( tmp, n, comm_rank_of_min );
for ( int i = 0; i < n; i++ )
x[i] = static_cast<char>( tmp[i] );
delete[] tmp;
}
}
// unsigned int
template<>
void MPI_CLASS::call_minReduce<unsigned int>(
const unsigned int *send, unsigned int *recv, const int n, int *comm_rank_of_min ) const
{
if ( comm_rank_of_min == nullptr ) {
PROFILE_START( "minReduce1<unsigned int>", profile_level );
MPI_Allreduce( (void *) send, (void *) recv, n, MPI_UNSIGNED, MPI_MIN, communicator );
PROFILE_STOP( "minReduce1<unsigned int>", profile_level );
} else {
auto tmp = new int[n];
for ( int i = 0; i < n; i++ )
tmp[i] = unsigned_to_signed( send[i] );
call_minReduce<int>( tmp, n, comm_rank_of_min );
for ( int i = 0; i < n; i++ )
recv[i] = signed_to_unsigned( tmp[i] );
delete[] tmp;
}
}
template<>
void MPI_CLASS::call_minReduce<unsigned int>(
unsigned int *x, const int n, int *comm_rank_of_min ) const
{
if ( comm_rank_of_min == nullptr ) {
PROFILE_START( "minReduce2<unsigned int>", profile_level );
auto send = x;
auto recv = new unsigned int[n];
MPI_Allreduce( send, recv, n, MPI_UNSIGNED, MPI_MIN, communicator );
for ( int i = 0; i < n; i++ )
x[i] = recv[i];
delete[] recv;
PROFILE_STOP( "minReduce2<unsigned int>", profile_level );
} else {
auto tmp = new int[n];
for ( int i = 0; i < n; i++ )
tmp[i] = unsigned_to_signed( x[i] );
call_minReduce<int>( tmp, n, comm_rank_of_min );
for ( int i = 0; i < n; i++ )
x[i] = signed_to_unsigned( tmp[i] );
delete[] tmp;
}
}
// int
template<>
void MPI_CLASS::call_minReduce<int>(
const int *x, int *y, const int n, int *comm_rank_of_min ) const
{
PROFILE_START( "minReduce1<int>", profile_level );
if ( comm_rank_of_min == nullptr ) {
MPI_Allreduce( (void *) x, (void *) y, n, MPI_INT, MPI_MIN, communicator );
} else {
auto recv = new IntIntStruct[n];
auto send = new IntIntStruct[n];
for ( int i = 0; i < n; ++i ) {
send[i].j = x[i];
send[i].i = comm_rank;
}
MPI_Allreduce( send, recv, n, MPI_2INT, MPI_MINLOC, communicator );
for ( int i = 0; i < n; ++i ) {
y[i] = recv[i].j;
comm_rank_of_min[i] = recv[i].i;
}
delete[] recv;
delete[] send;
}
PROFILE_STOP( "minReduce1<int>", profile_level );
}
template<>
void MPI_CLASS::call_minReduce<int>( int *x, const int n, int *comm_rank_of_min ) const
{
PROFILE_START( "minReduce2<int>", profile_level );
if ( comm_rank_of_min == nullptr ) {
auto send = x;
auto recv = new int[n];
MPI_Allreduce( send, recv, n, MPI_INT, MPI_MIN, communicator );
for ( int i = 0; i < n; i++ )
x[i] = recv[i];
delete[] recv;
} else {
auto recv = new IntIntStruct[n];
auto send = new IntIntStruct[n];
for ( int i = 0; i < n; ++i ) {
send[i].j = x[i];
send[i].i = comm_rank;
}
MPI_Allreduce( send, recv, n, MPI_2INT, MPI_MINLOC, communicator );
for ( int i = 0; i < n; ++i ) {
x[i] = recv[i].j;
comm_rank_of_min[i] = recv[i].i;
}
delete[] recv;
delete[] send;
}
PROFILE_STOP( "minReduce2<int>", profile_level );
}
// unsigned long int
template<>
void MPI_CLASS::call_minReduce<unsigned long int>( const unsigned long int *send,
unsigned long int *recv, const int n, int *comm_rank_of_min ) const
{
if ( comm_rank_of_min == nullptr ) {
PROFILE_START( "minReduce1<unsigned long>", profile_level );
MPI_Allreduce( (void *) send, (void *) recv, n, MPI_UNSIGNED_LONG, MPI_MIN, communicator );
PROFILE_STOP( "minReduce1<unsigned long>", profile_level );
} else {
auto tmp = new long int[n];
for ( int i = 0; i < n; i++ )
tmp[i] = unsigned_to_signed( send[i] );
call_minReduce<long int>( tmp, n, comm_rank_of_min );
for ( int i = 0; i < n; i++ )
recv[i] = signed_to_unsigned( tmp[i] );
delete[] tmp;
}
}
template<>
void MPI_CLASS::call_minReduce<unsigned long int>(
unsigned long int *x, const int n, int *comm_rank_of_min ) const
{
if ( comm_rank_of_min == nullptr ) {
PROFILE_START( "minReduce2<unsigned long>", profile_level );
auto send = x;
auto recv = new unsigned long int[n];
MPI_Allreduce( send, recv, n, MPI_UNSIGNED_LONG, MPI_MIN, communicator );
for ( int i = 0; i < n; i++ )
x[i] = recv[i];
delete[] recv;
PROFILE_STOP( "minReduce2<unsigned long>", profile_level );
} else {
auto tmp = new long int[n];
for ( int i = 0; i < n; i++ )
tmp[i] = unsigned_to_signed( x[i] );
call_minReduce<long int>( tmp, n, comm_rank_of_min );
for ( int i = 0; i < n; i++ )
x[i] = signed_to_unsigned( tmp[i] );
delete[] tmp;
}
}
// long int
template<>
void MPI_CLASS::call_minReduce<long int>(
const long int *x, long int *y, const int n, int *comm_rank_of_min ) const
{
PROFILE_START( "minReduce1<long int>", profile_level );
if ( comm_rank_of_min == nullptr ) {
MPI_Allreduce( (void *) x, (void *) y, n, MPI_LONG, MPI_MIN, communicator );
} else {
auto recv = new LongIntStruct[n];
auto send = new LongIntStruct[n];
for ( int i = 0; i < n; ++i ) {
send[i].j = x[i];
send[i].i = comm_rank;
}
MPI_Allreduce( send, recv, n, MPI_LONG_INT, MPI_MINLOC, communicator );
for ( int i = 0; i < n; ++i ) {
y[i] = recv[i].j;
comm_rank_of_min[i] = recv[i].i;
}
delete[] recv;
delete[] send;
}
PROFILE_STOP( "minReduce1<long int>", profile_level );
}
template<>
void MPI_CLASS::call_minReduce<long int>( long int *x, const int n, int *comm_rank_of_min ) const
{
PROFILE_START( "minReduce2<long int>", profile_level );
if ( comm_rank_of_min == nullptr ) {
auto send = x;
auto recv = new long int[n];
MPI_Allreduce( send, recv, n, MPI_LONG, MPI_MIN, communicator );
for ( long int i = 0; i < n; i++ )
x[i] = recv[i];
delete[] recv;
} else {
auto recv = new LongIntStruct[n];
auto send = new LongIntStruct[n];
for ( int i = 0; i < n; ++i ) {
send[i].j = x[i];
send[i].i = comm_rank;
}
MPI_Allreduce( send, recv, n, MPI_LONG_INT, MPI_MINLOC, communicator );
for ( int i = 0; i < n; ++i ) {
x[i] = recv[i].j;
comm_rank_of_min[i] = recv[i].i;
}
delete[] recv;
delete[] send;
}
PROFILE_STOP( "minReduce2<long int>", profile_level );
}
// unsigned long long int
template<>
void MPI_CLASS::call_minReduce<unsigned long long int>( const unsigned long long int *send,
unsigned long long int *recv, const int n, int *comm_rank_of_min ) const
{
PROFILE_START( "minReduce1<long int>", profile_level );
if ( comm_rank_of_min == nullptr ) {
auto x = new long long int[n];
auto y = new long long int[n];
for ( int i = 0; i < n; i++ )
x[i] = unsigned_to_signed( send[i] );
MPI_Allreduce( (void *) x, (void *) y, n, MPI_LONG_LONG_INT, MPI_MIN, communicator );
for ( int i = 0; i < n; i++ )
recv[i] = signed_to_unsigned( y[i] );
delete[] x;
delete[] y;
} else {
printf( "minReduce<long long int> will use double\n" );
auto tmp = new double[n];
for ( int i = 0; i < n; i++ )
tmp[i] = static_cast<double>( send[i] );
call_minReduce<double>( tmp, n, comm_rank_of_min );
for ( int i = 0; i < n; i++ )
recv[i] = static_cast<long long int>( tmp[i] );
delete[] tmp;
}
PROFILE_STOP( "minReduce1<long int>", profile_level );
}
template<>
void MPI_CLASS::call_minReduce<unsigned long long int>(
unsigned long long int *x, const int n, int *comm_rank_of_min ) const
{
auto recv = new unsigned long long int[n];
call_minReduce<unsigned long long int>( x, recv, n, comm_rank_of_min );
for ( int i = 0; i < n; i++ )
x[i] = recv[i];
delete[] recv;
}
// long long int
template<>
void MPI_CLASS::call_minReduce<long long int>(
const long long int *x, long long int *y, const int n, int *comm_rank_of_min ) const
{
PROFILE_START( "minReduce1<long int>", profile_level );
if ( comm_rank_of_min == nullptr ) {
MPI_Allreduce( (void *) x, (void *) y, n, MPI_LONG_LONG_INT, MPI_MIN, communicator );
} else {
printf( "minReduce<long long int> will use double\n" );
auto tmp = new double[n];
for ( int i = 0; i < n; i++ )
tmp[i] = static_cast<double>( x[i] );
call_minReduce<double>( tmp, n, comm_rank_of_min );
for ( int i = 0; i < n; i++ )
y[i] = static_cast<long long int>( tmp[i] );
delete[] tmp;
}
PROFILE_STOP( "minReduce1<long int>", profile_level );
}
template<>
void MPI_CLASS::call_minReduce<long long int>(
long long int *x, const int n, int *comm_rank_of_min ) const
{
auto recv = new long long int[n];
call_minReduce<long long int>( x, recv, n, comm_rank_of_min );
for ( int i = 0; i < n; i++ )
x[i] = recv[i];
delete[] recv;
}
// float
template<>
void MPI_CLASS::call_minReduce<float>(
const float *x, float *y, const int n, int *comm_rank_of_min ) const
{
PROFILE_START( "minReduce1<float>", profile_level );
if ( comm_rank_of_min == nullptr ) {
MPI_Allreduce( (void *) x, (void *) y, n, MPI_FLOAT, MPI_MIN, communicator );
} else {
auto recv = new FloatIntStruct[n];
auto send = new FloatIntStruct[n];
for ( int i = 0; i < n; ++i ) {
send[i].f = x[i];
send[i].i = comm_rank;
}
MPI_Allreduce( send, recv, n, MPI_FLOAT_INT, MPI_MINLOC, communicator );
for ( int i = 0; i < n; ++i ) {
y[i] = recv[i].f;
comm_rank_of_min[i] = recv[i].i;
}
delete[] recv;
delete[] send;
}
PROFILE_STOP( "minReduce1<float>", profile_level );
}
template<>
void MPI_CLASS::call_minReduce<float>( float *x, const int n, int *comm_rank_of_min ) const
{
PROFILE_START( "minReduce2<float>", profile_level );
if ( comm_rank_of_min == nullptr ) {
auto send = x;
auto recv = new float[n];
MPI_Allreduce( send, recv, n, MPI_FLOAT, MPI_MIN, communicator );
for ( int i = 0; i < n; i++ )
x[i] = recv[i];
delete[] recv;
} else {
auto recv = new FloatIntStruct[n];
auto send = new FloatIntStruct[n];
for ( int i = 0; i < n; ++i ) {
send[i].f = x[i];
send[i].i = comm_rank;
}
MPI_Allreduce( send, recv, n, MPI_FLOAT_INT, MPI_MINLOC, communicator );
for ( int i = 0; i < n; ++i ) {
x[i] = recv[i].f;
comm_rank_of_min[i] = recv[i].i;
}
delete[] recv;
delete[] send;
}
PROFILE_STOP( "minReduce2<float>", profile_level );
}
// double
template<>
void MPI_CLASS::call_minReduce<double>(
const double *x, double *y, const int n, int *comm_rank_of_min ) const
{
PROFILE_START( "minReduce1<double>", profile_level );
if ( comm_rank_of_min == nullptr ) {
MPI_Allreduce( (void *) x, (void *) y, n, MPI_DOUBLE, MPI_MIN, communicator );
} else {
auto recv = new DoubleIntStruct[n];
auto send = new DoubleIntStruct[n];
for ( int i = 0; i < n; ++i ) {
send[i].d = x[i];
send[i].i = comm_rank;
}
MPI_Allreduce( send, recv, n, MPI_DOUBLE_INT, MPI_MINLOC, communicator );
for ( int i = 0; i < n; ++i ) {
y[i] = recv[i].d;
comm_rank_of_min[i] = recv[i].i;
}
delete[] recv;
delete[] send;
}
PROFILE_STOP( "minReduce1<double>", profile_level );
}
template<>
void MPI_CLASS::call_minReduce<double>( double *x, const int n, int *comm_rank_of_min ) const
{
PROFILE_START( "minReduce2<double>", profile_level );
if ( comm_rank_of_min == nullptr ) {
auto send = x;
auto recv = new double[n];
MPI_Allreduce( send, recv, n, MPI_DOUBLE, MPI_MIN, communicator );
for ( int i = 0; i < n; i++ )
x[i] = recv[i];
delete[] recv;
} else {
auto recv = new DoubleIntStruct[n];
auto send = new DoubleIntStruct[n];
for ( int i = 0; i < n; ++i ) {
send[i].d = x[i];
send[i].i = comm_rank;
}
MPI_Allreduce( send, recv, n, MPI_DOUBLE_INT, MPI_MINLOC, communicator );
for ( int i = 0; i < n; ++i ) {
x[i] = recv[i].d;
comm_rank_of_min[i] = recv[i].i;
}
delete[] recv;
delete[] send;
}
PROFILE_STOP( "minReduce2<double>", profile_level );
}
#endif
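/* Illustrative sketch of the MPI_MINLOC pattern used by the specializations above
 * (standard MPI, not part of the original source): the value and the owning rank are
 * packed into a struct matching MPI_DOUBLE_INT and reduced together:
 *     struct { double d; int i; } in = { localValue, comm_rank }, out;   // localValue is hypothetical
 *     MPI_Allreduce( &in, &out, 1, MPI_DOUBLE_INT, MPI_MINLOC, communicator );
 *     // out.d holds the global minimum, out.i the rank that owns it
 */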
/************************************************************************
* call_maxReduce *
* Note: these specializations are only called when using MPI. *
************************************************************************/
#ifdef USE_MPI
// unsigned char
template<>
void MPI_CLASS::call_maxReduce<unsigned char>(
const unsigned char *send, unsigned char *recv, const int n, int *comm_rank_of_max ) const
{
if ( comm_rank_of_max == nullptr ) {
PROFILE_START( "maxReduce1<unsigned char>", profile_level );
MPI_Allreduce( (void *) send, (void *) recv, n, MPI_UNSIGNED_CHAR, MPI_MAX, communicator );
PROFILE_STOP( "maxReduce1<unsigned char>", profile_level );
} else {
auto tmp = new int[n];
for ( int i = 0; i < n; i++ )
tmp[i] = send[i];
call_maxReduce<int>( tmp, n, comm_rank_of_max );
for ( int i = 0; i < n; i++ )
recv[i] = static_cast<unsigned char>( tmp[i] );
delete[] tmp;
}
}
template<>
void MPI_CLASS::call_maxReduce<unsigned char>(
unsigned char *x, const int n, int *comm_rank_of_max ) const
{
if ( comm_rank_of_max == nullptr ) {
PROFILE_START( "maxReduce2<unsigned char>", profile_level );
auto send = x;
auto recv = new unsigned char[n];
MPI_Allreduce( send, recv, n, MPI_UNSIGNED_CHAR, MPI_MAX, communicator );
for ( int i = 0; i < n; i++ )
x[i] = recv[i];
delete[] recv;
PROFILE_STOP( "maxReduce2<unsigned char>", profile_level );
} else {
auto tmp = new int[n];
for ( int i = 0; i < n; i++ )
tmp[i] = x[i];
call_maxReduce<int>( tmp, n, comm_rank_of_max );
for ( int i = 0; i < n; i++ )
x[i] = static_cast<unsigned char>( tmp[i] );
delete[] tmp;
}
}
// char
template<>
void MPI_CLASS::call_maxReduce<char>(
const char *send, char *recv, const int n, int *comm_rank_of_max ) const
{
if ( comm_rank_of_max == nullptr ) {
PROFILE_START( "maxReduce1<char>", profile_level );
MPI_Allreduce( (void *) send, (void *) recv, n, MPI_SIGNED_CHAR, MPI_MAX, communicator );
PROFILE_STOP( "maxReduce1<char>", profile_level );
} else {
auto tmp = new int[n];
for ( int i = 0; i < n; i++ )
tmp[i] = send[i];
call_maxReduce<int>( tmp, n, comm_rank_of_max );
for ( int i = 0; i < n; i++ )
recv[i] = static_cast<char>( tmp[i] );
delete[] tmp;
}
}
template<>
void MPI_CLASS::call_maxReduce<char>( char *x, const int n, int *comm_rank_of_max ) const
{
if ( comm_rank_of_max == nullptr ) {
PROFILE_START( "maxReduce2<char>", profile_level );
auto send = x;
auto recv = new char[n];
MPI_Allreduce( send, recv, n, MPI_SIGNED_CHAR, MPI_MAX, communicator );
for ( int i = 0; i < n; i++ )
x[i] = recv[i];
delete[] recv;
PROFILE_STOP( "maxReduce2<char>", profile_level );
} else {
auto tmp = new int[n];
for ( int i = 0; i < n; i++ )
tmp[i] = x[i];
call_maxReduce<int>( tmp, n, comm_rank_of_max );
for ( int i = 0; i < n; i++ )
x[i] = static_cast<char>( tmp[i] );
delete[] tmp;
}
}
// unsigned int
template<>
void MPI_CLASS::call_maxReduce<unsigned int>(
const unsigned int *send, unsigned int *recv, const int n, int *comm_rank_of_max ) const
{
if ( comm_rank_of_max == nullptr ) {
PROFILE_START( "maxReduce1<unsigned int>", profile_level );
MPI_Allreduce( (void *) send, (void *) recv, n, MPI_UNSIGNED, MPI_MAX, communicator );
PROFILE_STOP( "maxReduce1<unsigned int>", profile_level );
} else {
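        // MPI_MAXLOC has no unsigned pair type, so route through the int version.  This
        // assumes unsigned_to_signed/signed_to_unsigned provide an order-preserving mapping
        // between unsigned int and int, so the maximum (and its owning rank) is preserved.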
auto tmp = new int[n];
for ( int i = 0; i < n; i++ )
tmp[i] = unsigned_to_signed( send[i] );
call_maxReduce<int>( tmp, n, comm_rank_of_max );
for ( int i = 0; i < n; i++ )
recv[i] = signed_to_unsigned( tmp[i] );
delete[] tmp;
}
}
template<>
void MPI_CLASS::call_maxReduce<unsigned int>(
unsigned int *x, const int n, int *comm_rank_of_max ) const
{
if ( comm_rank_of_max == nullptr ) {
PROFILE_START( "maxReduce2<unsigned int>", profile_level );
auto send = x;
auto recv = new unsigned int[n];
MPI_Allreduce( send, recv, n, MPI_UNSIGNED, MPI_MAX, communicator );
for ( int i = 0; i < n; i++ )
x[i] = recv[i];
delete[] recv;
PROFILE_STOP( "maxReduce2<unsigned int>", profile_level );
} else {
auto tmp = new int[n];
for ( int i = 0; i < n; i++ )
tmp[i] = unsigned_to_signed( x[i] );
call_maxReduce<int>( tmp, n, comm_rank_of_max );
for ( int i = 0; i < n; i++ )
x[i] = signed_to_unsigned( tmp[i] );
delete[] tmp;
}
}
// int
template<>
void MPI_CLASS::call_maxReduce<int>(
const int *x, int *y, const int n, int *comm_rank_of_max ) const
{
PROFILE_START( "maxReduce1<int>", profile_level );
if ( comm_rank_of_max == nullptr ) {
MPI_Allreduce( (void *) x, (void *) y, n, MPI_INT, MPI_MAX, communicator );
} else {
auto recv = new IntIntStruct[n];
auto send = new IntIntStruct[n];
for ( int i = 0; i < n; ++i ) {
send[i].j = x[i];
send[i].i = comm_rank;
}
MPI_Allreduce( send, recv, n, MPI_2INT, MPI_MAXLOC, communicator );
for ( int i = 0; i < n; ++i ) {
y[i] = recv[i].j;
comm_rank_of_max[i] = recv[i].i;
}
delete[] recv;
delete[] send;
}
PROFILE_STOP( "maxReduce1<int>", profile_level );
}
template<>
void MPI_CLASS::call_maxReduce<int>( int *x, const int n, int *comm_rank_of_max ) const
{
PROFILE_START( "maxReduce2<int>", profile_level );
if ( comm_rank_of_max == nullptr ) {
int *send = x;
auto recv = new int[n];
MPI_Allreduce( send, recv, n, MPI_INT, MPI_MAX, communicator );
for ( int i = 0; i < n; i++ )
x[i] = recv[i];
delete[] recv;
} else {
auto recv = new IntIntStruct[n];
auto send = new IntIntStruct[n];
for ( int i = 0; i < n; ++i ) {
send[i].j = x[i];
send[i].i = comm_rank;
}
MPI_Allreduce( send, recv, n, MPI_2INT, MPI_MAXLOC, communicator );
for ( int i = 0; i < n; ++i ) {
x[i] = recv[i].j;
comm_rank_of_max[i] = recv[i].i;
}
delete[] recv;
delete[] send;
}
PROFILE_STOP( "maxReduce2<int>", profile_level );
}
// long int
template<>
void MPI_CLASS::call_maxReduce<long int>(
const long int *x, long int *y, const int n, int *comm_rank_of_max ) const
{
    PROFILE_START( "maxReduce1<long int>", profile_level );
if ( comm_rank_of_max == nullptr ) {
MPI_Allreduce( (void *) x, (void *) y, n, MPI_LONG, MPI_MAX, communicator );
} else {
auto recv = new LongIntStruct[n];
auto send = new LongIntStruct[n];
for ( int i = 0; i < n; ++i ) {
send[i].j = x[i];
send[i].i = comm_rank;
}
MPI_Allreduce( send, recv, n, MPI_LONG_INT, MPI_MAXLOC, communicator );
for ( int i = 0; i < n; ++i ) {
y[i] = recv[i].j;
comm_rank_of_max[i] = recv[i].i;
}
delete[] recv;
delete[] send;
}
    PROFILE_STOP( "maxReduce1<long int>", profile_level );
}
template<>
void MPI_CLASS::call_maxReduce<long int>( long int *x, const int n, int *comm_rank_of_max ) const
{
    PROFILE_START( "maxReduce2<long int>", profile_level );
if ( comm_rank_of_max == nullptr ) {
auto send = x;
auto recv = new long int[n];
MPI_Allreduce( send, recv, n, MPI_LONG, MPI_MAX, communicator );
for ( int i = 0; i < n; i++ )
x[i] = recv[i];
delete[] recv;
} else {
auto recv = new LongIntStruct[n];
auto send = new LongIntStruct[n];
for ( int i = 0; i < n; ++i ) {
send[i].j = x[i];
send[i].i = comm_rank;
}
MPI_Allreduce( send, recv, n, MPI_LONG_INT, MPI_MAXLOC, communicator );
for ( int i = 0; i < n; ++i ) {
x[i] = recv[i].j;
comm_rank_of_max[i] = recv[i].i;
}
delete[] recv;
delete[] send;
}
    PROFILE_STOP( "maxReduce2<long int>", profile_level );
}
// unsigned long int
template<>
void MPI_CLASS::call_maxReduce<unsigned long int>( const unsigned long int *send,
unsigned long int *recv, const int n, int *comm_rank_of_max ) const
{
if ( comm_rank_of_max == nullptr ) {
PROFILE_START( "maxReduce1<unsigned long>", profile_level );
MPI_Allreduce( (void *) send, (void *) recv, n, MPI_UNSIGNED_LONG, MPI_MAX, communicator );
PROFILE_STOP( "maxReduce1<unsigned long>", profile_level );
} else {
auto tmp = new long int[n];
for ( int i = 0; i < n; i++ )
tmp[i] = unsigned_to_signed( send[i] );
call_maxReduce<long int>( tmp, n, comm_rank_of_max );
for ( int i = 0; i < n; i++ )
recv[i] = signed_to_unsigned( tmp[i] );
delete[] tmp;
}
}
template<>
void MPI_CLASS::call_maxReduce<unsigned long int>(
unsigned long int *x, const int n, int *comm_rank_of_max ) const
{
if ( comm_rank_of_max == nullptr ) {
PROFILE_START( "maxReduce2<unsigned long>", profile_level );
auto send = x;
auto recv = new unsigned long int[n];
MPI_Allreduce( send, recv, n, MPI_UNSIGNED_LONG, MPI_MAX, communicator );
for ( int i = 0; i < n; i++ )
x[i] = recv[i];
delete[] recv;
PROFILE_STOP( "maxReduce2<unsigned long>", profile_level );
} else {
auto tmp = new long int[n];
for ( int i = 0; i < n; i++ )
tmp[i] = unsigned_to_signed( x[i] );
call_maxReduce<long int>( tmp, n, comm_rank_of_max );
for ( int i = 0; i < n; i++ )
x[i] = signed_to_unsigned( tmp[i] );
delete[] tmp;
}
}
// unsigned long long int
template<>
void MPI_CLASS::call_maxReduce<unsigned long long int>( const unsigned long long int *send,
unsigned long long int *recv, const int n, int *comm_rank_of_max ) const
{
    PROFILE_START( "maxReduce1<unsigned long long>", profile_level );
if ( comm_rank_of_max == nullptr ) {
auto x = new long long int[n];
auto y = new long long int[n];
for ( int i = 0; i < n; i++ )
x[i] = unsigned_to_signed( send[i] );
MPI_Allreduce( (void *) x, (void *) y, n, MPI_LONG_LONG_INT, MPI_MAX, communicator );
for ( int i = 0; i < n; i++ )
recv[i] = signed_to_unsigned( y[i] );
delete[] x;
delete[] y;
} else {
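        // Fall back to a double reduction to recover the owning rank; values above 2^53
        // cannot be represented exactly in a double, so the result may lose precision.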
        printf( "maxReduce<unsigned long long int> will use double\n" );
auto tmp = new double[n];
for ( int i = 0; i < n; i++ )
tmp[i] = static_cast<double>( send[i] );
call_maxReduce<double>( tmp, n, comm_rank_of_max );
for ( int i = 0; i < n; i++ )
            recv[i] = static_cast<unsigned long long int>( tmp[i] );
delete[] tmp;
}
    PROFILE_STOP( "maxReduce1<unsigned long long>", profile_level );
}
template<>
void MPI_CLASS::call_maxReduce<unsigned long long int>(
unsigned long long int *x, const int n, int *comm_rank_of_max ) const
{
auto recv = new unsigned long long int[n];
call_maxReduce<unsigned long long int>( x, recv, n, comm_rank_of_max );
for ( int i = 0; i < n; i++ )
x[i] = recv[i];
delete[] recv;
}
// long long int
template<>
void MPI_CLASS::call_maxReduce<long long int>(
const long long int *x, long long int *y, const int n, int *comm_rank_of_max ) const
{
    PROFILE_START( "maxReduce1<long long int>", profile_level );
if ( comm_rank_of_max == nullptr ) {
MPI_Allreduce( (void *) x, (void *) y, n, MPI_LONG_LONG_INT, MPI_MAX, communicator );
} else {
printf( "maxReduce<long long int> will use double\n" );
auto tmp = new double[n];
for ( int i = 0; i < n; i++ )
tmp[i] = static_cast<double>( x[i] );
call_maxReduce<double>( tmp, n, comm_rank_of_max );
for ( int i = 0; i < n; i++ )
y[i] = static_cast<long long int>( tmp[i] );
delete[] tmp;
}
    PROFILE_STOP( "maxReduce1<long long int>", profile_level );
}
template<>
void MPI_CLASS::call_maxReduce<long long int>(
long long int *x, const int n, int *comm_rank_of_max ) const
{
auto recv = new long long int[n];
call_maxReduce<long long int>( x, recv, n, comm_rank_of_max );
for ( int i = 0; i < n; i++ )
        x[i] = recv[i];
delete[] recv;
}
// float
template<>
void MPI_CLASS::call_maxReduce<float>(
const float *x, float *y, const int n, int *comm_rank_of_max ) const
{
PROFILE_START( "maxReduce1<float>", profile_level );
if ( comm_rank_of_max == nullptr ) {
MPI_Allreduce( (void *) x, (void *) y, n, MPI_FLOAT, MPI_MAX, communicator );
} else {
auto recv = new FloatIntStruct[n];
auto send = new FloatIntStruct[n];
for ( int i = 0; i < n; ++i ) {
send[i].f = x[i];
send[i].i = comm_rank;
}
MPI_Allreduce( send, recv, n, MPI_FLOAT_INT, MPI_MAXLOC, communicator );
for ( int i = 0; i < n; ++i ) {
y[i] = recv[i].f;
comm_rank_of_max[i] = recv[i].i;
}
delete[] recv;
delete[] send;
}
PROFILE_STOP( "maxReduce1<float>", profile_level );
}
template<>
void MPI_CLASS::call_maxReduce<float>( float *x, const int n, int *comm_rank_of_max ) const
{
PROFILE_START( "maxReduce2<float>", profile_level );
if ( comm_rank_of_max == nullptr ) {
auto send = x;
auto recv = new float[n];
MPI_Allreduce( send, recv, n, MPI_FLOAT, MPI_MAX, communicator );
for ( int i = 0; i < n; i++ )
x[i] = recv[i];
delete[] recv;
} else {
auto recv = new FloatIntStruct[n];
auto send = new FloatIntStruct[n];
for ( int i = 0; i < n; ++i ) {
send[i].f = x[i];
send[i].i = comm_rank;
}
MPI_Allreduce( send, recv, n, MPI_FLOAT_INT, MPI_MAXLOC, communicator );
for ( int i = 0; i < n; ++i ) {
x[i] = recv[i].f;
comm_rank_of_max[i] = recv[i].i;
}
delete[] recv;
delete[] send;
}
PROFILE_STOP( "maxReduce2<float>", profile_level );
}
// double
template<>
void MPI_CLASS::call_maxReduce<double>(
const double *x, double *y, const int n, int *comm_rank_of_max ) const
{
PROFILE_START( "maxReduce1<double>", profile_level );
if ( comm_rank_of_max == nullptr ) {
MPI_Allreduce( (void *) x, (void *) y, n, MPI_DOUBLE, MPI_MAX, communicator );
} else {
auto recv = new DoubleIntStruct[n];
auto send = new DoubleIntStruct[n];
for ( int i = 0; i < n; ++i ) {
send[i].d = x[i];
send[i].i = comm_rank;
}
MPI_Allreduce( send, recv, n, MPI_DOUBLE_INT, MPI_MAXLOC, communicator );
for ( int i = 0; i < n; ++i ) {
y[i] = recv[i].d;
comm_rank_of_max[i] = recv[i].i;
}
delete[] recv;
delete[] send;
}
PROFILE_STOP( "maxReduce1<double>", profile_level );
}
template<>
void MPI_CLASS::call_maxReduce<double>( double *x, const int n, int *comm_rank_of_max ) const
{
PROFILE_START( "maxReduce2<double>", profile_level );
if ( comm_rank_of_max == nullptr ) {
auto send = x;
auto recv = new double[n];
MPI_Allreduce( send, recv, n, MPI_DOUBLE, MPI_MAX, communicator );
for ( int i = 0; i < n; i++ )
x[i] = recv[i];
delete[] recv;
} else {
auto recv = new DoubleIntStruct[n];
auto send = new DoubleIntStruct[n];
for ( int i = 0; i < n; ++i ) {
send[i].d = x[i];
send[i].i = comm_rank;
}
MPI_Allreduce( send, recv, n, MPI_DOUBLE_INT, MPI_MAXLOC, communicator );
for ( int i = 0; i < n; ++i ) {
x[i] = recv[i].d;
comm_rank_of_max[i] = recv[i].i;
}
delete[] recv;
delete[] send;
}
PROFILE_STOP( "maxReduce2<double>", profile_level );
}
#endif
/************************************************************************
* bcast *
* Note: these specializations are only called when using MPI. *
************************************************************************/
#ifdef USE_MPI
// char
template<>
void MPI_CLASS::call_bcast<unsigned char>( unsigned char *x, const int n, const int root ) const
{
PROFILE_START( "bcast<unsigned char>", profile_level );
MPI_Bcast( x, n, MPI_UNSIGNED_CHAR, root, communicator );
PROFILE_STOP( "bcast<unsigned char>", profile_level );
}
template<>
void MPI_CLASS::call_bcast<char>( char *x, const int n, const int root ) const
{
PROFILE_START( "bcast<char>", profile_level );
MPI_Bcast( x, n, MPI_CHAR, root, communicator );
PROFILE_STOP( "bcast<char>", profile_level );
}
// int
template<>
void MPI_CLASS::call_bcast<unsigned int>( unsigned int *x, const int n, const int root ) const
{
PROFILE_START( "bcast<unsigned int>", profile_level );
MPI_Bcast( x, n, MPI_UNSIGNED, root, communicator );
PROFILE_STOP( "bcast<unsigned int>", profile_level );
}
template<>
void MPI_CLASS::call_bcast<int>( int *x, const int n, const int root ) const
{
PROFILE_START( "bcast<int>", profile_level );
MPI_Bcast( x, n, MPI_INT, root, communicator );
PROFILE_STOP( "bcast<int>", profile_level );
}
// float
template<>
void MPI_CLASS::call_bcast<float>( float *x, const int n, const int root ) const
{
PROFILE_START( "bcast<float>", profile_level );
MPI_Bcast( x, n, MPI_FLOAT, root, communicator );
PROFILE_STOP( "bcast<float>", profile_level );
}
// double
template<>
void MPI_CLASS::call_bcast<double>( double *x, const int n, const int root ) const
{
PROFILE_START( "bcast<double>", profile_level );
MPI_Bcast( x, n, MPI_DOUBLE, root, communicator );
PROFILE_STOP( "bcast<double>", profile_level );
}
#else
// We need a concrete instantiation of bcast<char>(x,n,root);
template<>
void MPI_CLASS::call_bcast<char>( char *, const int, const int ) const
{
}
#endif
/************************************************************************
* Perform a global barrier across all processors. *
************************************************************************/
void MPI_CLASS::barrier() const
{
#ifdef USE_MPI
MPI_Barrier( communicator );
#endif
}
/************************************************************************
* Send data array to another processor. *
* Note: these specializations are only called when using MPI. *
************************************************************************/
#ifdef USE_MPI
// char
template<>
void MPI_CLASS::send<char>(
const char *buf, const int length, const int recv_proc_number, int tag ) const
{
// Set the tag to 0 if it is < 0
tag = ( tag >= 0 ) ? tag : 0;
MPI_INSIST( tag <= d_maxTag, "Maximum tag value exceeded" );
// Send the data
PROFILE_START( "send<char>", profile_level );
MPI_Send( (void *) buf, length, MPI_CHAR, recv_proc_number, tag, communicator );
PROFILE_STOP( "send<char>", profile_level );
}
// int
template<>
void MPI_CLASS::send<int>(
const int *buf, const int length, const int recv_proc_number, int tag ) const
{
// Set the tag to 0 if it is < 0
tag = ( tag >= 0 ) ? tag : 0;
MPI_INSIST( tag <= d_maxTag, "Maximum tag value exceeded" );
// Send the data
PROFILE_START( "send<int>", profile_level );
MPI_Send( (void *) buf, length, MPI_INT, recv_proc_number, tag, communicator );
PROFILE_STOP( "send<int>", profile_level );
}
// float
template<>
void MPI_CLASS::send<float>(
const float *buf, const int length, const int recv_proc_number, int tag ) const
{
// Set the tag to 0 if it is < 0
tag = ( tag >= 0 ) ? tag : 0;
MPI_INSIST( tag <= d_maxTag, "Maximum tag value exceeded" );
// Send the data
PROFILE_START( "send<float>", profile_level );
MPI_Send( (void *) buf, length, MPI_FLOAT, recv_proc_number, tag, communicator );
PROFILE_STOP( "send<float>", profile_level );
}
// double
template<>
void MPI_CLASS::send<double>(
const double *buf, const int length, const int recv_proc_number, int tag ) const
{
// Set the tag to 0 if it is < 0
tag = ( tag >= 0 ) ? tag : 0;
MPI_INSIST( tag <= d_maxTag, "Maximum tag value exceeded" );
// Send the data
PROFILE_START( "send<double>", profile_level );
MPI_Send( (void *) buf, length, MPI_DOUBLE, recv_proc_number, tag, communicator );
PROFILE_STOP( "send<double>", profile_level );
}
#else
// We need a concrete instantiation of send for use without MPI
template<>
void MPI_CLASS::send<char>( const char *buf, const int length, const int, int tag ) const
{
MPI_INSIST( tag <= d_maxTag, "Maximum tag value exceeded" );
MPI_INSIST( tag >= 0, "tag must be >= 0" );
PROFILE_START( "send<char>", profile_level );
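    // Without MPI, a send can only complete a matching Irecv that was posted earlier with the
    // same communicator/tag; look it up by the hashed request id and copy the data directly
    // into the receiver's buffer.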
auto id = getRequest( communicator, tag );
auto it = global_isendrecv_list.find( id );
    MPI_INSIST( it != global_isendrecv_list.end(),
        "send must be paired with a previous call to irecv in serial" );
    MPI_ASSERT( it->second.status == 2 );
    memcpy( (char *) it->second.data, buf, length );
    global_isendrecv_list.erase( it );
    PROFILE_STOP( "send<char>", profile_level );
}
#endif
/************************************************************************
* Non-blocking send data array to another processor. *
* Note: these specializations are only called when using MPI. *
************************************************************************/
#ifdef USE_MPI
// char
template<>
MPI_Request MPI_CLASS::Isend<char>(
const char *buf, const int length, const int recv_proc, const int tag ) const
{
MPI_INSIST( tag <= d_maxTag, "Maximum tag value exceeded" );
MPI_INSIST( tag >= 0, "tag must be >= 0" );
MPI_Request request;
PROFILE_START( "Isend<char>", profile_level );
MPI_Isend( (void *) buf, length, MPI_CHAR, recv_proc, tag, communicator, &request );
PROFILE_STOP( "Isend<char>", profile_level );
return request;
}
// int
template<>
MPI_Request MPI_CLASS::Isend<int>(
const int *buf, const int length, const int recv_proc, const int tag ) const
{
MPI_INSIST( tag <= d_maxTag, "Maximum tag value exceeded" );
MPI_INSIST( tag >= 0, "tag must be >= 0" );
MPI_Request request;
PROFILE_START( "Isend<int>", profile_level );
MPI_Isend( (void *) buf, length, MPI_INT, recv_proc, tag, communicator, &request );
PROFILE_STOP( "Isend<int>", profile_level );
return request;
}
// float
template<>
MPI_Request MPI_CLASS::Isend<float>(
const float *buf, const int length, const int recv_proc, const int tag ) const
{
MPI_INSIST( tag <= d_maxTag, "Maximum tag value exceeded" );
MPI_INSIST( tag >= 0, "tag must be >= 0" );
MPI_Request request;
PROFILE_START( "Isend<float>", profile_level );
MPI_Isend( (void *) buf, length, MPI_FLOAT, recv_proc, tag, communicator, &request );
PROFILE_STOP( "Isend<float>", profile_level );
return request;
}
// double
template<>
MPI_Request MPI_CLASS::Isend<double>(
const double *buf, const int length, const int recv_proc, const int tag ) const
{
MPI_INSIST( tag <= d_maxTag, "Maximum tag value exceeded" );
MPI_INSIST( tag >= 0, "tag must be >= 0" );
MPI_Request request;
PROFILE_START( "Isend<double>", profile_level );
MPI_Isend( (void *) buf, length, MPI_DOUBLE, recv_proc, tag, communicator, &request );
PROFILE_STOP( "Isend<double>", profile_level );
return request;
}
#else
// We need a concrete instantiation of send for use without mpi
template<>
MPI_Request MPI_CLASS::Isend<char>(
const char *buf, const int length, const int, const int tag ) const
{
MPI_INSIST( tag <= d_maxTag, "Maximum tag value exceeded" );
MPI_INSIST( tag >= 0, "tag must be >= 0" );
PROFILE_START( "Isend<char>", profile_level );
auto id = getRequest( communicator, tag );
auto it = global_isendrecv_list.find( id );
if ( it == global_isendrecv_list.end() ) {
// We are calling isend first
Isendrecv_struct data;
data.data = buf;
data.status = 1;
global_isendrecv_list.insert( std::pair<MPI_Request, Isendrecv_struct>( id, data ) );
} else {
// We called irecv first
MPI_ASSERT( it->second.status == 2 );
memcpy( (char *) it->second.data, buf, length );
global_isendrecv_list.erase( it );
}
PROFILE_STOP( "Isend<char>", profile_level );
return id;
}
#endif
/************************************************************************
* Send byte array to another processor. *
************************************************************************/
void MPI_CLASS::sendBytes(
const void *buf, const int number_bytes, const int recv_proc_number, int tag ) const
{
MPI_INSIST( tag <= d_maxTag, "Maximum tag value exceeded" );
MPI_INSIST( tag >= 0, "tag must be >= 0" );
send<char>( (const char *) buf, number_bytes, recv_proc_number, tag );
}
/************************************************************************
* Non-blocking send byte array to another processor. *
************************************************************************/
MPI_Request MPI_CLASS::IsendBytes(
const void *buf, const int number_bytes, const int recv_proc, const int tag ) const
{
MPI_INSIST( tag <= d_maxTag, "Maximum tag value exceeded" );
MPI_INSIST( tag >= 0, "tag must be >= 0" );
return Isend<char>( (const char *) buf, number_bytes, recv_proc, tag );
}
/************************************************************************
 * Receive data array from another processor.                           *
* Note: these specializations are only called when using MPI. *
************************************************************************/
#ifdef USE_MPI
// char
template<>
void MPI_CLASS::recv<char>(
char *buf, int &length, const int send_proc_number, const bool get_length, int tag ) const
{
// Set the tag to 0 if it is < 0
tag = ( tag >= 0 ) ? tag : 0;
MPI_INSIST( tag <= d_maxTag, "Maximum tag value exceeded" );
PROFILE_START( "recv<char>", profile_level );
    // Get the receive length if necessary
if ( get_length ) {
int bytes = this->probe( send_proc_number, tag );
int recv_length = bytes / sizeof( char );
        MPI_INSIST( length >= recv_length, "Received length is larger than allocated array" );
length = recv_length;
}
    // Receive the data
MPI_Status status;
MPI_Recv( (void *) buf, length, MPI_CHAR, send_proc_number, tag, communicator, &status );
PROFILE_STOP( "recv<char>", profile_level );
}
// int
template<>
void MPI_CLASS::recv<int>(
int *buf, int &length, const int send_proc_number, const bool get_length, int tag ) const
{
// Set the tag to 0 if it is < 0
tag = ( tag >= 0 ) ? tag : 0;
MPI_INSIST( tag <= d_maxTag, "Maximum tag value exceeded" );
PROFILE_START( "recv<int>", profile_level );
    // Get the receive length if necessary
if ( get_length ) {
int bytes = this->probe( send_proc_number, tag );
int recv_length = bytes / sizeof( int );
        MPI_INSIST( length >= recv_length, "Received length is larger than allocated array" );
length = recv_length;
}
    // Receive the data
MPI_Status status;
MPI_Recv( (void *) buf, length, MPI_INT, send_proc_number, tag, communicator, &status );
PROFILE_STOP( "recv<int>", profile_level );
}
// float
template<>
void MPI_CLASS::recv<float>(
float *buf, int &length, const int send_proc_number, const bool get_length, int tag ) const
{
// Set the tag to 0 if it is < 0
tag = ( tag >= 0 ) ? tag : 0;
MPI_INSIST( tag <= d_maxTag, "Maximum tag value exceeded" );
PROFILE_START( "recv<float>", profile_level );
    // Get the receive length if necessary
if ( get_length ) {
int bytes = this->probe( send_proc_number, tag );
int recv_length = bytes / sizeof( float );
        MPI_INSIST( length >= recv_length, "Received length is larger than allocated array" );
length = recv_length;
}
    // Receive the data
MPI_Status status;
MPI_Recv( (void *) buf, length, MPI_FLOAT, send_proc_number, tag, communicator, &status );
PROFILE_STOP( "recv<float>", profile_level );
}
// double
template<>
void MPI_CLASS::recv<double>(
double *buf, int &length, const int send_proc_number, const bool get_length, int tag ) const
{
// Set the tag to 0 if it is < 0
tag = ( tag >= 0 ) ? tag : 0;
MPI_INSIST( tag <= d_maxTag, "Maximum tag value exceeded" );
PROFILE_START( "recv<double>", profile_level );
    // Get the receive length if necessary
if ( get_length ) {
int bytes = this->probe( send_proc_number, tag );
int recv_length = bytes / sizeof( double );
        MPI_INSIST( length >= recv_length, "Received length is larger than allocated array" );
length = recv_length;
}
    // Receive the data
MPI_Status status;
MPI_Recv( (void *) buf, length, MPI_DOUBLE, send_proc_number, tag, communicator, &status );
PROFILE_STOP( "recv<double>", profile_level );
}
#else
// We need a concrete instantiation of recv for use without mpi
template<>
void MPI_CLASS::recv<char>( char *buf, int &length, const int, const bool, int tag ) const
{
MPI_INSIST( tag <= d_maxTag, "Maximum tag value exceeded" );
MPI_INSIST( tag >= 0, "tag must be >= 0" );
PROFILE_START( "recv<char>", profile_level );
auto id = getRequest( communicator, tag );
auto it = global_isendrecv_list.find( id );
MPI_INSIST( it != global_isendrecv_list.end(),
"recv must be paired with a previous call to isend in serial" );
MPI_ASSERT( it->second.status == 1 );
memcpy( buf, it->second.data, length );
global_isendrecv_list.erase( it );
PROFILE_STOP( "recv<char>", profile_level );
}
#endif
/************************************************************************
 * Non-blocking receive data array from another processor.              *
* Note: these specializations are only called when using MPI. *
************************************************************************/
#ifdef USE_MPI
// char
template<>
MPI_Request MPI_CLASS::Irecv<char>(
char *buf, const int length, const int send_proc, const int tag ) const
{
MPI_INSIST( tag <= d_maxTag, "Maximum tag value exceeded" );
MPI_INSIST( tag >= 0, "tag must be >= 0" );
MPI_Request request;
PROFILE_START( "Irecv<char>", profile_level );
MPI_Irecv( (void *) buf, length, MPI_CHAR, send_proc, tag, communicator, &request );
PROFILE_STOP( "Irecv<char>", profile_level );
return request;
}
// int
template<>
MPI_Request MPI_CLASS::Irecv<int>(
int *buf, const int length, const int send_proc, const int tag ) const
{
MPI_INSIST( tag <= d_maxTag, "Maximum tag value exceeded" );
MPI_INSIST( tag >= 0, "tag must be >= 0" );
MPI_Request request;
PROFILE_START( "Irecv<int>", profile_level );
MPI_Irecv( (void *) buf, length, MPI_INT, send_proc, tag, communicator, &request );
PROFILE_STOP( "Irecv<int>", profile_level );
return request;
}
// float
template<>
MPI_Request MPI_CLASS::Irecv<float>(
float *buf, const int length, const int send_proc, const int tag ) const
{
MPI_INSIST( tag <= d_maxTag, "Maximum tag value exceeded" );
MPI_INSIST( tag >= 0, "tag must be >= 0" );
MPI_Request request;
PROFILE_START( "Irecv<float>", profile_level );
MPI_Irecv( (void *) buf, length, MPI_FLOAT, send_proc, tag, communicator, &request );
PROFILE_STOP( "Irecv<float>", profile_level );
return request;
}
// double
template<>
MPI_Request MPI_CLASS::Irecv<double>(
double *buf, const int length, const int send_proc, const int tag ) const
{
MPI_INSIST( tag <= d_maxTag, "Maximum tag value exceeded" );
MPI_INSIST( tag >= 0, "tag must be >= 0" );
MPI_Request request;
PROFILE_START( "Irecv<double>", profile_level );
MPI_Irecv( (void *) buf, length, MPI_DOUBLE, send_proc, tag, communicator, &request );
PROFILE_STOP( "Irecv<double>", profile_level );
return request;
}
#else
// We need a concrete instantiation of irecv for use without mpi
template<>
MPI_Request MPI_CLASS::Irecv<char>( char *buf, const int length, const int, const int tag ) const
{
MPI_INSIST( tag <= d_maxTag, "Maximum tag value exceeded" );
MPI_INSIST( tag >= 0, "tag must be >= 0" );
PROFILE_START( "Irecv<char>", profile_level );
auto id = getRequest( communicator, tag );
auto it = global_isendrecv_list.find( id );
if ( it == global_isendrecv_list.end() ) {
// We are calling Irecv first
Isendrecv_struct data;
data.data = buf;
data.status = 2;
global_isendrecv_list.insert( std::pair<MPI_Request, Isendrecv_struct>( id, data ) );
} else {
// We called Isend first
MPI_ASSERT( it->second.status == 1 );
memcpy( buf, it->second.data, length );
global_isendrecv_list.erase( it );
}
PROFILE_STOP( "Irecv<char>", profile_level );
return id;
}
#endif
/************************************************************************
 * Receive byte array from another processor.                           *
************************************************************************/
void MPI_CLASS::recvBytes( void *buf, int &number_bytes, const int send_proc, int tag ) const
{
recv<char>( (char *) buf, number_bytes, send_proc, false, tag );
}
/************************************************************************
 * Non-blocking receive byte array from another processor.              *
************************************************************************/
MPI_Request MPI_CLASS::IrecvBytes(
void *buf, const int number_bytes, const int send_proc, const int tag ) const
{
MPI_INSIST( tag <= d_maxTag, "Maximum tag value exceeded" );
MPI_INSIST( tag >= 0, "tag must be >= 0" );
return Irecv<char>( (char *) buf, number_bytes, send_proc, tag );
}
/************************************************************************
* sendrecv *
************************************************************************/
#if defined( USE_MPI ) || defined( USE_EXT_MPI )
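// Each specialization forwards to MPI_Sendrecv, which performs the send and the matching
// receive in a single call (avoiding the ordering/deadlock issues of separate blocking
// send/recv calls) and discards the status via MPI_STATUS_IGNORE.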
template<>
void MPI_CLASS::sendrecv<char>( const char* sendbuf, int sendcount, int dest, int sendtag,
char* recvbuf, int recvcount, int source, int recvtag ) const
{
PROFILE_START( "sendrecv<char>", profile_level );
MPI_Sendrecv( sendbuf, sendcount, MPI_CHAR, dest, sendtag,
recvbuf, recvcount, MPI_CHAR, source, recvtag,
communicator, MPI_STATUS_IGNORE );
PROFILE_STOP( "sendrecv<char>", profile_level );
}
template<>
void MPI_CLASS::sendrecv<int>( const int* sendbuf, int sendcount, int dest, int sendtag,
int* recvbuf, int recvcount, int source, int recvtag ) const
{
PROFILE_START( "sendrecv<int>", profile_level );
MPI_Sendrecv( sendbuf, sendcount, MPI_INT, dest, sendtag,
recvbuf, recvcount, MPI_INT, source, recvtag,
communicator, MPI_STATUS_IGNORE );
PROFILE_STOP( "sendrecv<int>", profile_level );
}
template<>
void MPI_CLASS::sendrecv<float>( const float* sendbuf, int sendcount, int dest, int sendtag,
float* recvbuf, int recvcount, int source, int recvtag ) const
{
PROFILE_START( "sendrecv<float>", profile_level );
MPI_Sendrecv( sendbuf, sendcount, MPI_FLOAT, dest, sendtag,
recvbuf, recvcount, MPI_FLOAT, source, recvtag,
communicator, MPI_STATUS_IGNORE );
PROFILE_STOP( "sendrecv<float>", profile_level );
}
template<>
void MPI_CLASS::sendrecv<double>( const double* sendbuf, int sendcount, int dest, int sendtag,
double* recvbuf, int recvcount, int source, int recvtag ) const
{
PROFILE_START( "sendrecv<double>", profile_level );
MPI_Sendrecv( sendbuf, sendcount, MPI_DOUBLE, dest, sendtag,
recvbuf, recvcount, MPI_DOUBLE, source, recvtag,
communicator, MPI_STATUS_IGNORE );
PROFILE_STOP( "sendrecv<double>", profile_level );
}
#endif
/************************************************************************
* allGather *
* Note: these specializations are only called when using MPI. *
************************************************************************/
#ifdef USE_MPI
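// The scalar forms gather one value from every rank into x_out (length comm_size); the
// vector forms use MPI_Allgatherv, where the caller supplies the per-rank receive counts
// (size_out) and displacements (disp_out).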
// unsigned char
template<>
void MPI_CLASS::call_allGather<unsigned char>(
const unsigned char &x_in, unsigned char *x_out ) const
{
PROFILE_START( "allGather<unsigned char>", profile_level );
MPI_Allgather(
(void *) &x_in, 1, MPI_UNSIGNED_CHAR, (void *) x_out, 1, MPI_UNSIGNED_CHAR, communicator );
PROFILE_STOP( "allGather<unsigned char>", profile_level );
}
template<>
void MPI_CLASS::call_allGather<unsigned char>( const unsigned char *x_in, int size_in,
unsigned char *x_out, int *size_out, int *disp_out ) const
{
PROFILE_START( "allGatherv<unsigned char>", profile_level );
MPI_Allgatherv( (void *) x_in, size_in, MPI_CHAR, (void *) x_out, size_out, disp_out, MPI_CHAR,
communicator );
PROFILE_STOP( "allGatherv<unsigned char>", profile_level );
}
// char
template<>
void MPI_CLASS::call_allGather<char>( const char &x_in, char *x_out ) const
{
PROFILE_START( "allGather<char>", profile_level );
MPI_Allgather( (void *) &x_in, 1, MPI_CHAR, (void *) x_out, 1, MPI_CHAR, communicator );
PROFILE_STOP( "allGather<char>", profile_level );
}
template<>
void MPI_CLASS::call_allGather<char>(
const char *x_in, int size_in, char *x_out, int *size_out, int *disp_out ) const
{
PROFILE_START( "allGatherv<char>", profile_level );
MPI_Allgatherv( (void *) x_in, size_in, MPI_CHAR, (void *) x_out, size_out, disp_out, MPI_CHAR,
communicator );
PROFILE_STOP( "allGatherv<char>", profile_level );
}
// unsigned int
template<>
void MPI_CLASS::call_allGather<unsigned int>( const unsigned int &x_in, unsigned int *x_out ) const
{
PROFILE_START( "allGather<unsigned int>", profile_level );
MPI_Allgather( (void *) &x_in, 1, MPI_UNSIGNED, (void *) x_out, 1, MPI_UNSIGNED, communicator );
PROFILE_STOP( "allGather<unsigned int>", profile_level );
}
template<>
void MPI_CLASS::call_allGather<unsigned int>(
const unsigned int *x_in, int size_in, unsigned int *x_out, int *size_out, int *disp_out ) const
{
PROFILE_START( "allGatherv<unsigned int>", profile_level );
MPI_Allgatherv( (void *) x_in, size_in, MPI_UNSIGNED, (void *) x_out, size_out, disp_out,
MPI_UNSIGNED, communicator );
PROFILE_STOP( "allGatherv<unsigned int>", profile_level );
}
// int
template<>
void MPI_CLASS::call_allGather<int>( const int &x_in, int *x_out ) const
{
PROFILE_START( "allGather<int>", profile_level );
MPI_Allgather( (void *) &x_in, 1, MPI_INT, (void *) x_out, 1, MPI_INT, communicator );
PROFILE_STOP( "allGather<int>", profile_level );
}
template<>
void MPI_CLASS::call_allGather<int>(
const int *x_in, int size_in, int *x_out, int *size_out, int *disp_out ) const
{
PROFILE_START( "allGatherv<int>", profile_level );
MPI_Allgatherv( (void *) x_in, size_in, MPI_INT, (void *) x_out, size_out, disp_out, MPI_INT,
communicator );
PROFILE_STOP( "allGatherv<int>", profile_level );
}
// unsigned long int
template<>
void MPI_CLASS::call_allGather<unsigned long int>(
const unsigned long int &x_in, unsigned long int *x_out ) const
{
PROFILE_START( "allGather<unsigned long>", profile_level );
MPI_Allgather(
(void *) &x_in, 1, MPI_UNSIGNED_LONG, (void *) x_out, 1, MPI_UNSIGNED_LONG, communicator );
PROFILE_STOP( "allGather<unsigned long>", profile_level );
}
template<>
void MPI_CLASS::call_allGather<unsigned long int>( const unsigned long int *x_in, int size_in,
unsigned long int *x_out, int *size_out, int *disp_out ) const
{
PROFILE_START( "allGatherv<unsigned long>", profile_level );
MPI_Allgatherv( (void *) x_in, size_in, MPI_UNSIGNED_LONG, (void *) x_out, size_out, disp_out,
MPI_UNSIGNED_LONG, communicator );
PROFILE_STOP( "allGatherv<unsigned long>", profile_level );
}
// long int
template<>
void MPI_CLASS::call_allGather<long int>( const long int &x_in, long int *x_out ) const
{
PROFILE_START( "allGather<long int>", profile_level );
MPI_Allgather( (void *) &x_in, 1, MPI_LONG, (void *) x_out, 1, MPI_LONG, communicator );
PROFILE_STOP( "allGather<long int>", profile_level );
}
template<>
void MPI_CLASS::call_allGather<long int>(
const long int *x_in, int size_in, long int *x_out, int *size_out, int *disp_out ) const
{
PROFILE_START( "allGatherv<long int>", profile_level );
MPI_Allgatherv( (void *) x_in, size_in, MPI_LONG, (void *) x_out, size_out, disp_out, MPI_LONG,
communicator );
PROFILE_STOP( "allGatherv<long int>", profile_level );
}
// float
template<>
void MPI_CLASS::call_allGather<float>( const float &x_in, float *x_out ) const
{
PROFILE_START( "allGather<float>", profile_level );
MPI_Allgather( (void *) &x_in, 1, MPI_FLOAT, (void *) x_out, 1, MPI_FLOAT, communicator );
PROFILE_STOP( "allGather<float>", profile_level );
}
template<>
void MPI_CLASS::call_allGather<float>(
const float *x_in, int size_in, float *x_out, int *size_out, int *disp_out ) const
{
PROFILE_START( "allGatherv<float>", profile_level );
MPI_Allgatherv( (void *) x_in, size_in, MPI_FLOAT, (void *) x_out, size_out, disp_out,
MPI_FLOAT, communicator );
PROFILE_STOP( "allGatherv<float>", profile_level );
}
// double
template<>
void MPI_CLASS::call_allGather<double>( const double &x_in, double *x_out ) const
{
PROFILE_START( "allGather<double>", profile_level );
MPI_Allgather( (void *) &x_in, 1, MPI_DOUBLE, (void *) x_out, 1, MPI_DOUBLE, communicator );
PROFILE_STOP( "allGather<double>", profile_level );
}
template<>
void MPI_CLASS::call_allGather<double>(
const double *x_in, int size_in, double *x_out, int *size_out, int *disp_out ) const
{
PROFILE_START( "allGatherv<double>", profile_level );
MPI_Allgatherv( (void *) x_in, size_in, MPI_DOUBLE, (void *) x_out, size_out, disp_out,
MPI_DOUBLE, communicator );
PROFILE_STOP( "allGatherv<double>", profile_level );
}
#else
// We need a concrete instantiation of call_allGather<char>(x_in,size_in,x_out,size_out)
template<>
void MPI_CLASS::call_allGather<char>( const char *, int, char *, int *, int * ) const
{
MPI_ERROR( "Internal error in communicator (allGather) " );
}
#endif
/************************************************************************
* allToAll *
* Note: these specializations are only called when using MPI. *
************************************************************************/
#ifdef USE_MPI
template<>
void MPI_CLASS::allToAll<unsigned char>(
const int n, const unsigned char *send, unsigned char *recv ) const
{
PROFILE_START( "allToAll<unsigned char>", profile_level );
MPI_Alltoall(
(void *) send, n, MPI_UNSIGNED_CHAR, (void *) recv, n, MPI_UNSIGNED_CHAR, communicator );
PROFILE_STOP( "allToAll<unsigned char>", profile_level );
}
template<>
void MPI_CLASS::allToAll<char>( const int n, const char *send, char *recv ) const
{
PROFILE_START( "allToAll<char>", profile_level );
MPI_Alltoall( (void *) send, n, MPI_CHAR, (void *) recv, n, MPI_CHAR, communicator );
PROFILE_STOP( "allToAll<char>", profile_level );
}
template<>
void MPI_CLASS::allToAll<unsigned int>(
const int n, const unsigned int *send, unsigned int *recv ) const
{
PROFILE_START( "allToAll<unsigned int>", profile_level );
MPI_Alltoall( (void *) send, n, MPI_UNSIGNED, (void *) recv, n, MPI_UNSIGNED, communicator );
PROFILE_STOP( "allToAll<unsigned int>", profile_level );
}
template<>
void MPI_CLASS::allToAll<int>( const int n, const int *send, int *recv ) const
{
PROFILE_START( "allToAll<int>", profile_level );
MPI_Alltoall( (void *) send, n, MPI_INT, (void *) recv, n, MPI_INT, communicator );
PROFILE_STOP( "allToAll<int>", profile_level );
}
template<>
void MPI_CLASS::allToAll<unsigned long int>(
const int n, const unsigned long int *send, unsigned long int *recv ) const
{
PROFILE_START( "allToAll<unsigned long>", profile_level );
MPI_Alltoall(
(void *) send, n, MPI_UNSIGNED_LONG, (void *) recv, n, MPI_UNSIGNED_LONG, communicator );
PROFILE_STOP( "allToAll<unsigned long>", profile_level );
}
template<>
void MPI_CLASS::allToAll<long int>( const int n, const long int *send, long int *recv ) const
{
PROFILE_START( "allToAll<long int>", profile_level );
MPI_Alltoall( (void *) send, n, MPI_LONG, (void *) recv, n, MPI_LONG, communicator );
PROFILE_STOP( "allToAll<long int>", profile_level );
}
template<>
void MPI_CLASS::allToAll<float>( const int n, const float *send, float *recv ) const
{
PROFILE_START( "allToAll<float>", profile_level );
MPI_Alltoall( (void *) send, n, MPI_FLOAT, (void *) recv, n, MPI_FLOAT, communicator );
PROFILE_STOP( "allToAll<float>", profile_level );
}
template<>
void MPI_CLASS::allToAll<double>( const int n, const double *send, double *recv ) const
{
PROFILE_START( "allToAll<double>", profile_level );
MPI_Alltoall( (void *) send, n, MPI_DOUBLE, (void *) recv, n, MPI_DOUBLE, communicator );
PROFILE_STOP( "allToAll<double>", profile_level );
}
#endif
/************************************************************************
* call_allToAll *
* Note: these specializations are only called when using MPI. *
************************************************************************/
#ifdef USE_MPI
// unsigned char
template<>
void MPI_CLASS::call_allToAll<unsigned char>( const unsigned char *send_data, const int send_cnt[],
const int send_disp[], unsigned char *recv_data, const int *recv_cnt,
const int *recv_disp ) const
{
PROFILE_START( "allToAllv<unsigned char>", profile_level );
MPI_Alltoallv( (void *) send_data, (int *) send_cnt, (int *) send_disp, MPI_UNSIGNED_CHAR,
(void *) recv_data, (int *) recv_cnt, (int *) recv_disp, MPI_UNSIGNED_CHAR, communicator );
PROFILE_STOP( "allToAllv<unsigned char>", profile_level );
}
// char
template<>
void MPI_CLASS::call_allToAll<char>( const char *send_data, const int send_cnt[],
const int send_disp[], char *recv_data, const int *recv_cnt, const int *recv_disp ) const
{
PROFILE_START( "allToAllv<char>", profile_level );
MPI_Alltoallv( (void *) send_data, (int *) send_cnt, (int *) send_disp, MPI_CHAR,
(void *) recv_data, (int *) recv_cnt, (int *) recv_disp, MPI_CHAR, communicator );
PROFILE_STOP( "allToAllv<char>", profile_level );
}
// unsigned int
template<>
void MPI_CLASS::call_allToAll<unsigned int>( const unsigned int *send_data, const int send_cnt[],
const int send_disp[], unsigned int *recv_data, const int *recv_cnt,
const int *recv_disp ) const
{
PROFILE_START( "allToAllv<unsigned int>", profile_level );
MPI_Alltoallv( (void *) send_data, (int *) send_cnt, (int *) send_disp, MPI_UNSIGNED,
(void *) recv_data, (int *) recv_cnt, (int *) recv_disp, MPI_UNSIGNED, communicator );
PROFILE_STOP( "allToAllv<unsigned int>", profile_level );
}
// int
template<>
void MPI_CLASS::call_allToAll<int>( const int *send_data, const int send_cnt[],
const int send_disp[], int *recv_data, const int *recv_cnt, const int *recv_disp ) const
{
PROFILE_START( "allToAllv<int>", profile_level );
MPI_Alltoallv( (void *) send_data, (int *) send_cnt, (int *) send_disp, MPI_INT,
(void *) recv_data, (int *) recv_cnt, (int *) recv_disp, MPI_INT, communicator );
PROFILE_STOP( "allToAllv<int>", profile_level );
}
// unsigned long int
template<>
void MPI_CLASS::call_allToAll<unsigned long int>( const unsigned long int *send_data,
const int send_cnt[], const int send_disp[], unsigned long int *recv_data, const int *recv_cnt,
const int *recv_disp ) const
{
PROFILE_START( "allToAllv<unsigned long>", profile_level );
MPI_Alltoallv( (void *) send_data, (int *) send_cnt, (int *) send_disp, MPI_UNSIGNED_LONG,
(void *) recv_data, (int *) recv_cnt, (int *) recv_disp, MPI_UNSIGNED_LONG, communicator );
PROFILE_STOP( "allToAllv<unsigned long>", profile_level );
}
// long int
template<>
void MPI_CLASS::call_allToAll<long int>( const long int *send_data, const int send_cnt[],
const int send_disp[], long int *recv_data, const int *recv_cnt, const int *recv_disp ) const
{
PROFILE_START( "allToAllv<long int>", profile_level );
MPI_Alltoallv( (void *) send_data, (int *) send_cnt, (int *) send_disp, MPI_LONG,
(void *) recv_data, (int *) recv_cnt, (int *) recv_disp, MPI_LONG, communicator );
PROFILE_STOP( "allToAllv<long int>", profile_level );
}
// float
template<>
void MPI_CLASS::call_allToAll<float>( const float *send_data, const int send_cnt[],
const int send_disp[], float *recv_data, const int *recv_cnt, const int *recv_disp ) const
{
PROFILE_START( "allToAllv<float>", profile_level );
MPI_Alltoallv( (void *) send_data, (int *) send_cnt, (int *) send_disp, MPI_FLOAT,
(void *) recv_data, (int *) recv_cnt, (int *) recv_disp, MPI_FLOAT, communicator );
PROFILE_STOP( "allToAllv<float>", profile_level );
}
// double
template<>
void MPI_CLASS::call_allToAll<double>( const double *send_data, const int send_cnt[],
const int send_disp[], double *recv_data, const int *recv_cnt, const int *recv_disp ) const
{
PROFILE_START( "allToAllv<double>", profile_level );
MPI_Alltoallv( (void *) send_data, (int *) send_cnt, (int *) send_disp, MPI_DOUBLE,
(void *) recv_data, (int *) recv_cnt, (int *) recv_disp, MPI_DOUBLE, communicator );
PROFILE_STOP( "allToAllv<double>", profile_level );
}
#else
// Default instantiation for char
template<>
void MPI_CLASS::call_allToAll<char>(
const char *, const int[], const int[], char *, const int *, const int * ) const
{
MPI_ERROR( "Should not reach this point" );
}
#endif
/************************************************************************
* call_sumScan *
* Note: these specializations are only called when using MPI. *
************************************************************************/
#ifdef USE_MPI
// unsigned char
template<>
void MPI_CLASS::call_sumScan<unsigned char>(
const unsigned char *send, unsigned char *recv, int n ) const
{
PROFILE_START( "sumScan<unsigned char>", profile_level );
MPI_Scan( (void *) send, (void *) recv, n, MPI_UNSIGNED_CHAR, MPI_SUM, communicator );
PROFILE_STOP( "sumScan<unsigned char>", profile_level );
}
// char
template<>
void MPI_CLASS::call_sumScan<char>( const char *send, char *recv, int n ) const
{
PROFILE_START( "sumScan<char>", profile_level );
MPI_Scan( (void *) send, (void *) recv, n, MPI_SIGNED_CHAR, MPI_SUM, communicator );
PROFILE_STOP( "sumScan<char>", profile_level );
}
// unsigned int
template<>
void MPI_CLASS::call_sumScan<unsigned int>(
const unsigned int *send, unsigned int *recv, int n ) const
{
PROFILE_START( "sumScan<unsigned int>", profile_level );
MPI_Scan( (void *) send, (void *) recv, n, MPI_UNSIGNED, MPI_SUM, communicator );
PROFILE_STOP( "sumScan<unsigned int>", profile_level );
}
// int
template<>
void MPI_CLASS::call_sumScan<int>( const int *send, int *recv, int n ) const
{
PROFILE_START( "sumScan<int>", profile_level );
MPI_Scan( (void *) send, (void *) recv, n, MPI_INT, MPI_SUM, communicator );
PROFILE_STOP( "sumScan<int>", profile_level );
}
// long int
template<>
void MPI_CLASS::call_sumScan<long int>( const long int *send, long int *recv, int n ) const
{
PROFILE_START( "sumScan<long int>", profile_level );
MPI_Scan( (void *) send, (void *) recv, n, MPI_LONG, MPI_SUM, communicator );
PROFILE_STOP( "sumScan<long int>", profile_level );
}
// unsigned long int
template<>
void MPI_CLASS::call_sumScan<unsigned long>(
const unsigned long *send, unsigned long *recv, int n ) const
{
PROFILE_START( "sumScan<unsigned long>", profile_level );
MPI_Scan( (void *) send, (void *) recv, n, MPI_UNSIGNED_LONG, MPI_SUM, communicator );
PROFILE_STOP( "sumScan<unsigned long>", profile_level );
}
// size_t
#ifdef USE_WINDOWS
template<>
void MPI_CLASS::call_sumScan<size_t>( const size_t *send, size_t *recv, int n ) const
{
MPI_ASSERT( MPI_SIZE_T != 0 );
PROFILE_START( "sumScan<size_t>", profile_level );
MPI_Scan( (void *) send, (void *) recv, n, MPI_SIZE_T, MPI_SUM, communicator );
PROFILE_STOP( "sumScan<size_t>", profile_level );
}
#endif
// float
template<>
void MPI_CLASS::call_sumScan<float>( const float *send, float *recv, int n ) const
{
PROFILE_START( "sumScan<float>", profile_level );
MPI_Scan( (void *) send, (void *) recv, n, MPI_FLOAT, MPI_SUM, communicator );
PROFILE_STOP( "sumScan<float>", profile_level );
}
// double
template<>
void MPI_CLASS::call_sumScan<double>( const double *send, double *recv, int n ) const
{
PROFILE_START( "sumScan<double>", profile_level );
MPI_Scan( (void *) send, (void *) recv, n, MPI_DOUBLE, MPI_SUM, communicator );
PROFILE_STOP( "sumScan<double>", profile_level );
}
// std::complex<double>
template<>
void MPI_CLASS::call_sumScan<std::complex<double>>(
const std::complex<double> *x, std::complex<double> *y, int n ) const
{
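    // No complex datatype/reduction is used here; pack each value as an interleaved
    // (real, imag) pair of doubles, since summing the parts independently is equivalent
    // to summing the complex values.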
auto send = new double[2 * n];
auto recv = new double[2 * n];
for ( int i = 0; i < n; i++ ) {
send[2 * i + 0] = real( x[i] );
send[2 * i + 1] = imag( x[i] );
}
MPI_Scan( (void *) send, (void *) recv, 2 * n, MPI_DOUBLE, MPI_SUM, communicator );
for ( int i = 0; i < n; i++ )
y[i] = std::complex<double>( recv[2 * i + 0], recv[2 * i + 1] );
delete[] send;
delete[] recv;
}
#endif
/************************************************************************
* call_minScan *
* Note: these specializations are only called when using MPI. *
************************************************************************/
#ifdef USE_MPI
// unsigned char
template<>
void MPI_CLASS::call_minScan<unsigned char>(
const unsigned char *send, unsigned char *recv, int n ) const
{
PROFILE_START( "minScan<unsigned char>", profile_level );
MPI_Scan( (void *) send, (void *) recv, n, MPI_UNSIGNED_CHAR, MPI_MIN, communicator );
PROFILE_STOP( "minScan<unsigned char>", profile_level );
}
// char
template<>
void MPI_CLASS::call_minScan<char>( const char *send, char *recv, int n ) const
{
PROFILE_START( "minScan<char>", profile_level );
MPI_Scan( (void *) send, (void *) recv, n, MPI_SIGNED_CHAR, MPI_MIN, communicator );
PROFILE_STOP( "minScan<char>", profile_level );
}
// unsigned int
template<>
void MPI_CLASS::call_minScan<unsigned int>(
const unsigned int *send, unsigned int *recv, int n ) const
{
PROFILE_START( "minScan<unsigned int>", profile_level );
MPI_Scan( (void *) send, (void *) recv, n, MPI_UNSIGNED, MPI_MIN, communicator );
PROFILE_STOP( "minScan<unsigned int>", profile_level );
}
// int
template<>
void MPI_CLASS::call_minScan<int>( const int *send, int *recv, int n ) const
{
PROFILE_START( "minScan<int>", profile_level );
MPI_Scan( (void *) send, (void *) recv, n, MPI_INT, MPI_MIN, communicator );
PROFILE_STOP( "minScan<int>", profile_level );
}
// unsigned long int
template<>
void MPI_CLASS::call_minScan<unsigned long int>(
const unsigned long int *send, unsigned long int *recv, int n ) const
{
PROFILE_START( "minScan<unsigned long>", profile_level );
MPI_Scan( (void *) send, (void *) recv, n, MPI_UNSIGNED_LONG, MPI_MIN, communicator );
PROFILE_STOP( "minScan<unsigned long>", profile_level );
}
// long int
template<>
void MPI_CLASS::call_minScan<long int>( const long int *send, long int *recv, int n ) const
{
PROFILE_START( "minScan<long int>", profile_level );
MPI_Scan( (void *) send, (void *) recv, n, MPI_LONG, MPI_MIN, communicator );
PROFILE_STOP( "minScan<long int>", profile_level );
}
// size_t
#ifdef USE_WINDOWS
template<>
void MPI_CLASS::call_minScan<size_t>( const size_t *send, size_t *recv, int n ) const
{
MPI_ASSERT( MPI_SIZE_T != 0 );
PROFILE_START( "minScan<size_t>", profile_level );
MPI_Scan( (void *) send, (void *) recv, n, MPI_SIZE_T, MPI_MIN, communicator );
PROFILE_STOP( "minScan<size_t>", profile_level );
}
#endif
// float
template<>
void MPI_CLASS::call_minScan<float>( const float *send, float *recv, int n ) const
{
PROFILE_START( "minScan<float>", profile_level );
MPI_Scan( (void *) send, (void *) recv, n, MPI_FLOAT, MPI_MIN, communicator );
PROFILE_STOP( "minScan<float>", profile_level );
}
// double
template<>
void MPI_CLASS::call_minScan<double>( const double *send, double *recv, int n ) const
{
PROFILE_START( "minScan<double>", profile_level );
MPI_Scan( (void *) send, (void *) recv, n, MPI_DOUBLE, MPI_MIN, communicator );
PROFILE_STOP( "minScan<double>", profile_level );
}
#endif
/************************************************************************
* call_maxScan *
* Note: these specializations are only called when using MPI. *
************************************************************************/
#ifdef USE_MPI
// unsigned char
template<>
void MPI_CLASS::call_maxScan<unsigned char>(
const unsigned char *send, unsigned char *recv, int n ) const
{
PROFILE_START( "maxScan<unsigned char>", profile_level );
MPI_Scan( (void *) send, (void *) recv, n, MPI_UNSIGNED_CHAR, MPI_MAX, communicator );
PROFILE_STOP( "maxScan<unsigned char>", profile_level );
}
// char
template<>
void MPI_CLASS::call_maxScan<char>( const char *send, char *recv, int n ) const
{
PROFILE_START( "maxScan<char>", profile_level );
MPI_Scan( (void *) send, (void *) recv, n, MPI_SIGNED_CHAR, MPI_MAX, communicator );
PROFILE_STOP( "maxScan<char>", profile_level );
}
// unsigned int
template<>
void MPI_CLASS::call_maxScan<unsigned int>(
const unsigned int *send, unsigned int *recv, int n ) const
{
PROFILE_START( "maxScan<unsigned int>", profile_level );
MPI_Scan( (void *) send, (void *) recv, n, MPI_UNSIGNED, MPI_MAX, communicator );
PROFILE_STOP( "maxScan<unsigned int>", profile_level );
}
// int
template<>
void MPI_CLASS::call_maxScan<int>( const int *send, int *recv, int n ) const
{
PROFILE_START( "maxScan<int>", profile_level );
MPI_Scan( (void *) send, (void *) recv, n, MPI_INT, MPI_MAX, communicator );
PROFILE_STOP( "maxScan<int>", profile_level );
}
// long int
template<>
void MPI_CLASS::call_maxScan<long int>( const long int *send, long int *recv, int n ) const
{
PROFILE_START( "maxScan<long int>", profile_level );
MPI_Scan( (void *) send, (void *) recv, n, MPI_LONG, MPI_MAX, communicator );
PROFILE_STOP( "maxScan<long int>", profile_level );
}
// unsigned long int
template<>
void MPI_CLASS::call_maxScan<unsigned long int>(
const unsigned long int *send, unsigned long int *recv, int n ) const
{
PROFILE_START( "maxScan<unsigned long>", profile_level );
MPI_Scan( (void *) send, (void *) recv, n, MPI_UNSIGNED_LONG, MPI_MAX, communicator );
PROFILE_STOP( "maxScan<unsigned long>", profile_level );
}
// size_t
#ifdef USE_WINDOWS
template<>
void MPI_CLASS::call_maxScan<size_t>( const size_t *send, size_t *recv, int n ) const
{
MPI_ASSERT( MPI_SIZE_T != 0 );
PROFILE_START( "maxScan<size_t>", profile_level );
MPI_Scan( (void *) send, (void *) recv, n, MPI_SIZE_T, MPI_MAX, communicator );
PROFILE_STOP( "maxScan<size_t>", profile_level );
}
#endif
// float
template<>
void MPI_CLASS::call_maxScan<float>( const float *send, float *recv, int n ) const
{
PROFILE_START( "maxScan<float>", profile_level );
    MPI_Scan( (void *) send, (void *) recv, n, MPI_FLOAT, MPI_MAX, communicator );
PROFILE_STOP( "maxScan<float>", profile_level );
}
// double
template<>
void MPI_CLASS::call_maxScan<double>( const double *send, double *recv, int n ) const
{
PROFILE_START( "maxScan<double>", profile_level );
MPI_Scan( (void *) send, (void *) recv, n, MPI_DOUBLE, MPI_MAX, communicator );
PROFILE_STOP( "maxScan<double>", profile_level );
}
#endif
/************************************************************************
* Communicate ranks for communication *
************************************************************************/
std::vector<int> MPI_CLASS::commRanks( const std::vector<int> &ranks ) const
{
#ifdef USE_MPI
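    // Each rank marks (with a one-byte flag) the ranks it wants to communicate with; the
    // one-element all-to-all transposes this flag matrix, so the result lists the ranks
    // that want to communicate with this rank.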
// Get a byte array with the ranks to communicate
auto data1 = new char[comm_size];
auto data2 = new char[comm_size];
memset( data1, 0, comm_size );
memset( data2, 0, comm_size );
for ( auto &rank : ranks )
data1[rank] = 1;
MPI_Alltoall( data1, 1, MPI_CHAR, data2, 1, MPI_CHAR, communicator );
int N = 0;
for ( int i = 0; i < comm_size; i++ )
N += data2[i];
std::vector<int> ranks_out;
ranks_out.reserve( N );
for ( int i = 0; i < comm_size; i++ ) {
if ( data2[i] )
ranks_out.push_back( i );
}
delete[] data1;
delete[] data2;
return ranks_out;
#else
return ranks;
#endif
}
/************************************************************************
* Wait functions *
************************************************************************/
#ifdef USE_MPI
void MPI_CLASS::wait( MPI_Request request )
{
PROFILE_START( "wait", profile_level );
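    // Poll with MPI_Test instead of blocking in MPI_Wait so the thread yields the CPU
    // between checks and other threads can continue to make progress.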
MPI_Status status;
int flag = 0;
int err = MPI_Test( &request, &flag, &status );
MPI_ASSERT( err == MPI_SUCCESS ); // Check that the first call is valid
while ( !flag ) {
// Put the current thread to sleep to allow other threads to run
sched_yield();
// Check if the request has finished
MPI_Test( &request, &flag, &status );
}
PROFILE_STOP( "wait", profile_level );
}
int MPI_CLASS::waitAny( int count, MPI_Request *request )
{
if ( count == 0 )
return -1;
PROFILE_START( "waitAny", profile_level );
int index = -1;
int flag = 0;
auto status = new MPI_Status[count];
int err = MPI_Testany( count, request, &index, &flag, status );
MPI_ASSERT( err == MPI_SUCCESS ); // Check that the first call is valid
while ( !flag ) {
// Put the current thread to sleep to allow other threads to run
sched_yield();
// Check if the request has finished
MPI_Testany( count, request, &index, &flag, status );
}
MPI_ASSERT( index >= 0 ); // Check that the index is valid
delete[] status;
PROFILE_STOP( "waitAny", profile_level );
return index;
}
void MPI_CLASS::waitAll( int count, MPI_Request *request )
{
if ( count == 0 )
return;
PROFILE_START( "waitAll", profile_level );
int flag = 0;
auto status = new MPI_Status[count];
int err = MPI_Testall( count, request, &flag, status );
MPI_ASSERT( err == MPI_SUCCESS ); // Check that the first call is valid
while ( !flag ) {
// Put the current thread to sleep to allow other threads to run
sched_yield();
// Check if the request has finished
MPI_Testall( count, request, &flag, status );
}
PROFILE_STOP( "waitAll", profile_level );
delete[] status;
}
std::vector<int> MPI_CLASS::waitSome( int count, MPI_Request *request )
{
if ( count == 0 )
return std::vector<int>();
PROFILE_START( "waitSome", profile_level );
std::vector<int> indices( count, -1 );
auto *status = new MPI_Status[count];
int outcount = 0;
int err = MPI_Testsome( count, request, &outcount, &indices[0], status );
MPI_ASSERT( err == MPI_SUCCESS );        // Check that the first call is valid
MPI_ASSERT( outcount != MPI_UNDEFINED ); // Check that the output count is valid
while ( outcount == 0 ) {
// Put the current thread to sleep to allow other threads to run
sched_yield();
// Check if any of the requests have finished
MPI_Testsome( count, request, &outcount, &indices[0], status );
}
indices.resize( outcount );
delete[] status;
PROFILE_STOP( "waitSome", profile_level );
return indices;
}
#else
void MPI_CLASS::wait( MPI_Request request )
{
PROFILE_START( "wait", profile_level );
while ( 1 ) {
// Check if the request is in our list
if ( global_isendrecv_list.find( request ) == global_isendrecv_list.end() )
break;
// Put the current thread to sleep to allow other threads to run
sched_yield();
}
PROFILE_STOP( "wait", profile_level );
}
int MPI_CLASS::waitAny( int count, MPI_Request *request )
{
if ( count == 0 )
return -1;
PROFILE_START( "waitAny", profile_level );
int index = 0;
while ( 1 ) {
// Check if the request is in our list
bool found_any = false;
for ( int i = 0; i < count; i++ ) {
if ( global_isendrecv_list.find( request[i] ) == global_isendrecv_list.end() ) {
found_any = true;
index = i;
}
}
if ( found_any )
break;
// Put the current thread to sleep to allow other threads to run
sched_yield();
}
PROFILE_STOP( "waitAny", profile_level );
return index;
}
void MPI_CLASS::waitAll( int count, MPI_Request *request )
{
if ( count == 0 )
return;
PROFILE_START( "waitAll", profile_level );
while ( 1 ) {
// Check if the request is in our list
bool found_all = true;
for ( int i = 0; i < count; i++ ) {
if ( global_isendrecv_list.find( request[i] ) != global_isendrecv_list.end() )
found_all = false;
}
if ( found_all )
break;
// Put the current thread to sleep to allow other threads to run
sched_yield();
}
PROFILE_STOP( "waitAll", profile_level );
}
std::vector<int> MPI_CLASS::waitSome( int count, MPI_Request *request )
{
if ( count == 0 )
return std::vector<int>();
PROFILE_START( "waitSome", profile_level );
std::vector<int> indices;
while ( 1 ) {
// Check which requests are no longer in our list (i.e. have finished)
for ( int i = 0; i < count; i++ ) {
if ( global_isendrecv_list.find( request[i] ) == global_isendrecv_list.end() )
indices.push_back( i );
}
if ( !indices.empty() )
break;
// Put the current thread to sleep to allow other threads to run
sched_yield();
}
PROFILE_STOP( "waitSome", profile_level );
return indices;
}
#endif
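// Example (illustrative sketch, not part of the original file): the wait routines above poll
// with MPI_Test* and sched_yield() rather than blocking inside MPI, so other threads can make
// progress while a request completes.  Assuming the wrapper exposes Isend()/Irecv() members
// that return MPI_Request (the comm object, buffers, ranks, and tag below are hypothetical):
//
//     MPI_Request req[2];
//     req[0] = comm.Isend( sendBuf, N, dst, tag );
//     req[1] = comm.Irecv( recvBuf, N, src, tag );
//     comm.waitAll( 2, req );    // returns once both requests have completed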
/************************************************************************
* Probe functions *
************************************************************************/
#ifdef USE_MPI
int MPI_CLASS::Iprobe( int source, int tag ) const
{
MPI_INSIST( tag <= d_maxTag, "Maximum tag value exceeded" );
MPI_INSIST( tag >= 0, "tag must be >= 0" );
MPI_Status status;
int flag = 0;
MPI_Iprobe( source, tag, communicator, &flag, &status );
if ( flag == 0 )
return -1;
int count;
MPI_Get_count( &status, MPI_BYTE, &count );
MPI_ASSERT( count >= 0 );
return count;
}
int MPI_CLASS::probe( int source, int tag ) const
{
MPI_INSIST( tag <= d_maxTag, "Maximum tag value exceeded" );
MPI_INSIST( tag >= 0, "tag must be >= 0" );
MPI_Status status;
MPI_Probe( source, tag, communicator, &status );
int count;
MPI_Get_count( &status, MPI_BYTE, &count );
MPI_ASSERT( count >= 0 );
return count;
}
#else
int MPI_CLASS::Iprobe( int, int ) const
{
MPI_ERROR( "Not implimented for serial codes (Iprobe)" );
return 0;
}
int MPI_CLASS::probe( int, int ) const
{
MPI_ERROR( "Not implimented for serial codes (probe)" );
return 0;
}
#endif
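// Example (illustrative sketch, not part of the original file): probe()/Iprobe() report the
// size in bytes of a pending message, which allows a receive buffer to be sized before the
// receive is posted.  The comm object, source rank, and tag are hypothetical.
//
//     int bytes = comm.probe( src, tag );     // blocks until a message is pending
//     std::vector<char> buffer( bytes );
//     // ... post the matching receive into buffer.data() ...
//
//     int pending = comm.Iprobe( src, tag );  // returns -1 if no message is pending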
/************************************************************************
* Timer functions *
************************************************************************/
#ifdef USE_MPI
double MPI_CLASS::time() { return MPI_Wtime(); }
double MPI_CLASS::tick() { return MPI_Wtick(); }
#else
double MPI_CLASS::time()
{
auto t = std::chrono::system_clock::now();
auto ns = std::chrono::duration_cast<std::chrono::nanoseconds>( t.time_since_epoch() );
return 1e-9 * ns.count();
}
double MPI_CLASS::tick()
{
auto period = std::chrono::system_clock::period();
return static_cast<double>( period.num ) / static_cast<double>( period.den );
}
#endif
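// Example (illustrative sketch, not part of the original file): time() returns wall-clock
// seconds (MPI_Wtime() with MPI, the C++ system clock otherwise) and tick() returns the timer
// resolution, so a region can be timed portably (comm is a hypothetical MPI_CLASS instance):
//
//     double start   = comm.time();
//     // ... work to be timed ...
//     double elapsed = comm.time() - start;   // seconds, resolution ~ comm.tick()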
/************************************************************************
* Serialize a block of code across MPI processes *
************************************************************************/
void MPI_CLASS::serializeStart()
{
#ifdef USE_MPI
using namespace std::chrono_literals;
if ( comm_rank == 0 ) {
// Start rank 0 immediately
} else {
// Wait for a message from the previous rank
MPI_Request request;
MPI_Status status;
int flag = false, buf = 0;
MPI_Irecv( &buf, 1, MPI_INT, comm_rank - 1, 5627, MPI_COMM_WORLD, &request );
while ( !flag ) {
MPI_Test( &request, &flag, &status );
std::this_thread::sleep_for( 50ms );
}
}
#endif
}
void MPI_CLASS::serializeStop()
{
#ifdef USE_MPI
using namespace std::chrono_literals;
if ( comm_rank < comm_size - 1 ) {
// Send flag to next rank
MPI_Send( &comm_rank, 1, MPI_INT, comm_rank + 1, 5627, MPI_COMM_WORLD );
// Wait for final finished flag
int flag = false, buf = 0;
MPI_Request request;
MPI_Status status;
MPI_Irecv( &buf, 1, MPI_INT, comm_size - 1, 5627, MPI_COMM_WORLD, &request );
while ( !flag ) {
MPI_Test( &request, &flag, &status );
std::this_thread::sleep_for( 50ms );
}
} else {
// Send final flag to all ranks
for ( int i = 0; i < comm_size - 1; i++ )
MPI_Send( &comm_rank, 1, MPI_INT, i, 5627, MPI_COMM_WORLD );
}
#endif
}
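// Example (illustrative sketch, not part of the original file): serializeStart() and
// serializeStop() force a region to execute one rank at a time in rank order, which is mainly
// useful for producing ordered debug output (comm is a hypothetical MPI_CLASS instance on
// MPI_COMM_WORLD):
//
//     comm.serializeStart();
//     std::cout << "per-rank diagnostic output" << std::endl;  // runs in rank order
//     comm.serializeStop();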
/****************************************************************************
* Function to start/stop MPI *
****************************************************************************/
#ifdef USE_EXT_MPI
static bool called_MPI_Init = false;
#endif
bool MPI_CLASS::MPI_Active()
{
#ifdef USE_EXT_MPI
int MPI_initialized, MPI_finalized;
MPI_Initialized( &MPI_initialized );
MPI_Finalized( &MPI_finalized );
return MPI_initialized != 0 && MPI_finalized == 0;
#else
return false;
#endif
}
void MPI_CLASS::start_MPI( int argc, char *argv[], int profile_level )
{
changeProfileLevel( profile_level );
NULL_USE( argc );
NULL_USE( argv );
#ifdef USE_EXT_MPI
if ( MPI_Active() ) {
called_MPI_Init = false;
} else {
int provided;
int result = MPI_Init_thread( &argc, &argv, MPI_THREAD_MULTIPLE, &provided );
if ( result != MPI_SUCCESS )
MPI_ERROR( "Unable to initialize MPI" );
if ( provided < MPI_THREAD_MULTIPLE )
std::cerr << "Warning: Failed to start MPI with MPI_THREAD_MULTIPLE\n";
called_MPI_Init = true;
}
#endif
}
void MPI_CLASS::stop_MPI()
{
#ifdef USE_EXT_MPI
int finalized;
MPI_Finalized( &finalized );
if ( called_MPI_Init && !finalized ) {
MPI_Barrier( MPI_COMM_WORLD );
MPI_Finalize();
called_MPI_Init = false; // MPI is no longer active once we have finalized it
}
#endif
}
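// Example (illustrative sketch, not part of the original file): start_MPI() and stop_MPI()
// bracket an application so MPI is initialized (requesting MPI_THREAD_MULTIPLE) only when it
// is not already active, and finalized only if this class started it.  They are assumed to be
// static members; the explicit profile level 0 matches the signature above.
//
//     int main( int argc, char *argv[] )
//     {
//         Utilities::MPI_CLASS::start_MPI( argc, argv, 0 );
//         // ... application code using the MPI wrapper ...
//         Utilities::MPI_CLASS::stop_MPI();
//         return 0;
//     }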
} // namespace Utilities