Files
LBPM/common/MPI.I
2021-03-26 13:20:49 -04:00

1185 lines
45 KiB
Plaintext

// This file contains the default instantiations for templated operations
// Note: Intel compilers need definitions before all default instantions to compile correctly
#ifndef included_MPI_I
#define included_MPI_I
#include "common/Utilities.h"
#include <cstdio>
#include <typeinfo>
#define MPI_CLASS MPI
#define MPI_CLASS_ERROR ERROR
#define MPI_CLASS_ASSERT ASSERT
#undef NULL_USE
// Mark a variable as deliberately unused: takes its address inside an
// if(0) block so the compiler sees a "use" without generating any runtime
// work.  (Comment kept outside the macro: a // inside a line-continued
// macro would swallow the continuation backslash.)
#define NULL_USE( variable ) \
do { \
if ( 0 ) { \
auto static t = (char *) &variable; \
t++; \
} \
} while ( 0 )
namespace Utilities {
// Helper trait: detects whether a type is a std::pair
template<typename>
struct is_pair : std::false_type {
};
template<typename A, typename B>
struct is_pair<std::pair<A, B>> : std::true_type {
};
// Compile-time check that a type may be shipped through MPI as raw bytes.
// Trivially copyable types qualify directly; a std::pair qualifies when both
// of its members qualify; every other type does not.
template<class TYPE>
constexpr typename std::enable_if<std::is_trivially_copyable<TYPE>::value, bool>::type
is_mpi_copyable()
{
    return true;
}
template<class TYPE>
constexpr typename std::enable_if<!std::is_trivially_copyable<TYPE>::value && is_pair<TYPE>::value,
    bool>::type
is_mpi_copyable()
{
    // A pair is copyable through MPI iff both of its members are
    return is_mpi_copyable<typename TYPE::first_type>() &&
           is_mpi_copyable<typename TYPE::second_type>();
}
template<class TYPE>
constexpr typename std::enable_if<!std::is_trivially_copyable<TYPE>::value && !is_pair<TYPE>::value,
    bool>::type
is_mpi_copyable()
{
    return false;
}
/************************************************************************
* sumReduce *
************************************************************************/
// Sum-reduce a single value across all ranks of the communicator
template<class TYPE>
inline TYPE MPI_CLASS::sumReduce( const TYPE value ) const
{
    // Serial communicator: nothing to reduce
    if ( comm_size <= 1 )
        return value;
    TYPE result = value;
    call_sumReduce( &result, 1 );
    return result;
}
// In-place sum-reduce of an array of n values across all ranks
template<class TYPE>
inline void MPI_CLASS::sumReduce( TYPE *x, const int n ) const
{
    // No-op on a serial communicator
    if ( comm_size > 1 )
        call_sumReduce( x, n );
}
// Sum-reduce the array x into y (x and y may not alias in parallel)
template<class TYPE>
inline void MPI_CLASS::sumReduce( const TYPE *x, TYPE *y, const int n ) const
{
    if ( comm_size > 1 ) {
        call_sumReduce( x, y, n );
        return;
    }
    // Serial: the reduction is a plain copy
    for ( int i = 0; i < n; ++i )
        y[i] = x[i];
}
// Define specializations of call_sumReduce(TYPE*, const int)
#if defined( USE_MPI ) || defined( USE_EXT_MPI )
template<>
void MPI_CLASS::call_sumReduce<unsigned char>( unsigned char *, const int ) const;
template<>
void MPI_CLASS::call_sumReduce<char>( char *, const int ) const;
template<>
void MPI_CLASS::call_sumReduce<unsigned int>( unsigned int *, const int ) const;
template<>
void MPI_CLASS::call_sumReduce<int>( int *, const int ) const;
template<>
void MPI_CLASS::call_sumReduce<unsigned long int>( unsigned long int *, const int ) const;
template<>
void MPI_CLASS::call_sumReduce<long int>( long int *, const int ) const;
template<>
void MPI_CLASS::call_sumReduce<size_t>( size_t *, const int ) const;
template<>
void MPI_CLASS::call_sumReduce<float>( float *, const int ) const;
template<>
void MPI_CLASS::call_sumReduce<double>( double *, const int ) const;
template<>
void MPI_CLASS::call_sumReduce<std::complex<double>>( std::complex<double> *, const int ) const;
#endif
// Default instantiations of call_sumReduce(TYPE*, const int)
// Fallback for types without an MPI specialization: reaching this on a
// parallel communicator is a hard error.
template<class TYPE>
void MPI_CLASS::call_sumReduce( TYPE *, const int ) const
{
    char message[200];
    // snprintf (not sprintf): typeid(TYPE).name() is a mangled name of
    // unbounded length and must not overflow the fixed-size buffer
    snprintf( message, sizeof( message ),
        "Default instantiation of sumReduce in parallel is not supported (%s)",
        typeid( TYPE ).name() );
    MPI_CLASS_ERROR( message );
}
// Define specializations of call_sumReduce(const TYPE*, TYPE*, const int)
#if defined( USE_MPI ) || defined( USE_EXT_MPI )
template<>
void MPI_CLASS::call_sumReduce<unsigned char>(
    const unsigned char *, unsigned char *, const int ) const;
template<>
void MPI_CLASS::call_sumReduce<char>( const char *, char *, const int ) const;
template<>
void MPI_CLASS::call_sumReduce<unsigned int>(
    const unsigned int *, unsigned int *, const int ) const;
template<>
void MPI_CLASS::call_sumReduce<int>( const int *, int *, const int ) const;
template<>
void MPI_CLASS::call_sumReduce<unsigned long int>(
    const unsigned long int *, unsigned long int *, const int ) const;
template<>
void MPI_CLASS::call_sumReduce<long int>( const long int *, long int *, const int ) const;
template<>
void MPI_CLASS::call_sumReduce<size_t>( const size_t *, size_t *, const int ) const;
template<>
void MPI_CLASS::call_sumReduce<float>( const float *, float *, const int ) const;
template<>
void MPI_CLASS::call_sumReduce<double>( const double *, double *, const int ) const;
template<>
void MPI_CLASS::call_sumReduce<std::complex<double>>(
    const std::complex<double> *, std::complex<double> *, const int ) const;
#endif
// Default instantiations of call_sumReduce(const TYPE*, TYPE*, const int)
// Fallback for unsupported types: always an error (reached only in parallel).
template<class TYPE>
void MPI_CLASS::call_sumReduce( const TYPE *x, TYPE *y, const int n ) const
{
    // Parameters exist only to match the specializations' interface
    NULL_USE( x );
    NULL_USE( y );
    NULL_USE( n );
    char message[200];
    // snprintf: typeid name length is unbounded, avoid buffer overflow
    snprintf( message, sizeof( message ),
        "Default instantiation of sumReduce in parallel is not supported (%s)",
        typeid( TYPE ).name() );
    MPI_CLASS_ERROR( message );
}
/************************************************************************
* minReduce *
************************************************************************/
// Min-reduce a single value across all ranks of the communicator
template<class TYPE>
inline TYPE MPI_CLASS::minReduce( const TYPE value ) const
{
    // Serial communicator: the value is already the minimum
    if ( comm_size <= 1 )
        return value;
    TYPE result = value;
    call_minReduce( &result, 1, nullptr );
    return result;
}
// In-place min-reduce of an array; optionally reports the rank owning each minimum
template<class TYPE>
inline void MPI_CLASS::minReduce( TYPE *x, const int n, int *rank_of_min ) const
{
    if ( comm_size > 1 ) {
        call_minReduce( x, n, rank_of_min );
        return;
    }
    // Serial: data is already the minimum; rank 0 owns every entry
    if ( rank_of_min != nullptr ) {
        for ( int i = 0; i < n; ++i )
            rank_of_min[i] = 0;
    }
}
// Min-reduce the array x into y; optionally reports the rank owning each minimum
template<class TYPE>
inline void MPI_CLASS::minReduce( const TYPE *x, TYPE *y, const int n, int *rank_of_min ) const
{
    if ( comm_size > 1 ) {
        call_minReduce( x, y, n, rank_of_min );
        return;
    }
    // Serial: copy the input; rank 0 owns every entry
    for ( int i = 0; i < n; ++i ) {
        y[i] = x[i];
        if ( rank_of_min != nullptr )
            rank_of_min[i] = 0;
    }
}
// Define specializations of call_minReduce(TYPE*, const int, int*)
#if defined( USE_MPI ) || defined( USE_EXT_MPI )
template<>
void MPI_CLASS::call_minReduce<unsigned char>( unsigned char *, const int, int * ) const;
template<>
void MPI_CLASS::call_minReduce<char>( char *, const int, int * ) const;
template<>
void MPI_CLASS::call_minReduce<unsigned int>( unsigned int *, const int, int * ) const;
template<>
void MPI_CLASS::call_minReduce<int>( int *, const int, int * ) const;
template<>
void MPI_CLASS::call_minReduce<unsigned long int>( unsigned long int *, const int, int * ) const;
template<>
void MPI_CLASS::call_minReduce<long int>( long int *, const int, int * ) const;
template<>
void MPI_CLASS::call_minReduce<unsigned long long int>(
    unsigned long long int *, const int, int * ) const;
template<>
void MPI_CLASS::call_minReduce<long long int>( long long int *, const int, int * ) const;
template<>
void MPI_CLASS::call_minReduce<size_t>( size_t *, const int, int * ) const;
template<>
void MPI_CLASS::call_minReduce<float>( float *, const int, int * ) const;
template<>
void MPI_CLASS::call_minReduce<double>( double *, const int, int * ) const;
#endif
// Default instantiations of call_minReduce(TYPE*, const int, int*)
// Fallback for unsupported types: always an error (reached only in parallel).
template<class TYPE>
void MPI_CLASS::call_minReduce( TYPE *, const int, int * ) const
{
    char message[200];
    // snprintf: typeid name length is unbounded, avoid buffer overflow
    snprintf( message, sizeof( message ),
        "Default instantiation of minReduce in parallel is not supported (%s)",
        typeid( TYPE ).name() );
    MPI_CLASS_ERROR( message );
}
// Define specializations of call_minReduce(const TYPE*, TYPE*, const int, int*)
#if defined( USE_MPI ) || defined( USE_EXT_MPI )
template<>
void MPI_CLASS::call_minReduce<unsigned char>(
    const unsigned char *, unsigned char *, const int, int * ) const;
template<>
void MPI_CLASS::call_minReduce<char>( const char *, char *, const int, int * ) const;
template<>
void MPI_CLASS::call_minReduce<unsigned int>(
    const unsigned int *, unsigned int *, const int, int * ) const;
template<>
void MPI_CLASS::call_minReduce<int>( const int *, int *, const int, int * ) const;
template<>
void MPI_CLASS::call_minReduce<unsigned long int>(
    const unsigned long int *, unsigned long int *, const int, int * ) const;
template<>
void MPI_CLASS::call_minReduce<long int>( const long int *, long int *, const int, int * ) const;
template<>
void MPI_CLASS::call_minReduce<unsigned long long int>(
    const unsigned long long int *, unsigned long long int *, const int, int * ) const;
template<>
void MPI_CLASS::call_minReduce<long long int>(
    const long long int *, long long int *, const int, int * ) const;
template<>
void MPI_CLASS::call_minReduce<size_t>( const size_t *, size_t *, const int, int * ) const;
template<>
void MPI_CLASS::call_minReduce<float>( const float *, float *, const int, int * ) const;
template<>
void MPI_CLASS::call_minReduce<double>( const double *, double *, const int, int * ) const;
#endif
// Default instantiations of call_minReduce(const TYPE*, TYPE*, const int, int*)
// Fallback for unsupported types: always an error (reached only in parallel).
template<class TYPE>
void MPI_CLASS::call_minReduce( const TYPE *, TYPE *, const int, int * ) const
{
    char message[200];
    // snprintf: typeid name length is unbounded, avoid buffer overflow
    snprintf( message, sizeof( message ),
        "Default instantiation of minReduce in parallel is not supported (%s)",
        typeid( TYPE ).name() );
    MPI_CLASS_ERROR( message );
}
/************************************************************************
* maxReduce *
************************************************************************/
// Max-reduce a single value across all ranks of the communicator
template<class TYPE>
inline TYPE MPI_CLASS::maxReduce( const TYPE value ) const
{
    // Serial communicator: the value is already the maximum
    if ( comm_size <= 1 )
        return value;
    TYPE result = value;
    call_maxReduce( &result, 1, nullptr );
    return result;
}
// In-place max-reduce of an array; optionally reports the rank owning each maximum
template<class TYPE>
inline void MPI_CLASS::maxReduce( TYPE *x, const int n, int *rank_of_max ) const
{
    if ( comm_size > 1 ) {
        call_maxReduce( x, n, rank_of_max );
        return;
    }
    // Serial: data is already the maximum; rank 0 owns every entry
    if ( rank_of_max != nullptr ) {
        for ( int i = 0; i < n; ++i )
            rank_of_max[i] = 0;
    }
}
// Max-reduce the array x into y; optionally reports the rank owning each maximum
template<class TYPE>
inline void MPI_CLASS::maxReduce( const TYPE *x, TYPE *y, const int n, int *rank_of_max ) const
{
    if ( comm_size > 1 ) {
        call_maxReduce( x, y, n, rank_of_max );
        return;
    }
    // Serial: copy the input; rank 0 owns every entry
    for ( int i = 0; i < n; ++i ) {
        y[i] = x[i];
        if ( rank_of_max != nullptr )
            rank_of_max[i] = 0;
    }
}
// Define specializations of call_maxReduce(TYPE*, const int, int*)
#if defined( USE_MPI ) || defined( USE_EXT_MPI )
template<>
void MPI_CLASS::call_maxReduce<unsigned char>( unsigned char *, const int, int * ) const;
template<>
void MPI_CLASS::call_maxReduce<char>( char *, const int, int * ) const;
template<>
void MPI_CLASS::call_maxReduce<unsigned int>( unsigned int *, const int, int * ) const;
template<>
void MPI_CLASS::call_maxReduce<int>( int *, const int, int * ) const;
template<>
void MPI_CLASS::call_maxReduce<unsigned long int>( unsigned long int *, const int, int * ) const;
template<>
void MPI_CLASS::call_maxReduce<long int>( long int *, const int, int * ) const;
template<>
void MPI_CLASS::call_maxReduce<unsigned long long int>(
    unsigned long long int *, const int, int * ) const;
template<>
void MPI_CLASS::call_maxReduce<long long int>( long long int *, const int, int * ) const;
template<>
void MPI_CLASS::call_maxReduce<size_t>( size_t *, const int, int * ) const;
template<>
void MPI_CLASS::call_maxReduce<float>( float *, const int, int * ) const;
template<>
void MPI_CLASS::call_maxReduce<double>( double *, const int, int * ) const;
#endif
// Default instantiations of call_maxReduce(TYPE*, const int, int*)
// Fallback for unsupported types: always an error (reached only in parallel).
template<class TYPE>
void MPI_CLASS::call_maxReduce( TYPE *, const int, int * ) const
{
    char message[200];
    // snprintf: typeid name length is unbounded, avoid buffer overflow
    snprintf( message, sizeof( message ),
        "Default instantiation of maxReduce in parallel is not supported (%s)",
        typeid( TYPE ).name() );
    MPI_CLASS_ERROR( message );
}
// Define specializations of call_maxReduce(const TYPE*, TYPE*, const int, int*)
#if defined( USE_MPI ) || defined( USE_EXT_MPI )
template<>
void MPI_CLASS::call_maxReduce<unsigned char>(
    const unsigned char *, unsigned char *, const int, int * ) const;
template<>
void MPI_CLASS::call_maxReduce<char>( const char *, char *, const int, int * ) const;
template<>
void MPI_CLASS::call_maxReduce<unsigned int>(
    const unsigned int *, unsigned int *, const int, int * ) const;
template<>
void MPI_CLASS::call_maxReduce<int>( const int *, int *, const int, int * ) const;
template<>
void MPI_CLASS::call_maxReduce<unsigned long int>(
    const unsigned long int *, unsigned long int *, const int, int * ) const;
template<>
void MPI_CLASS::call_maxReduce<long int>( const long int *, long int *, const int, int * ) const;
template<>
void MPI_CLASS::call_maxReduce<unsigned long long int>(
    const unsigned long long int *, unsigned long long int *, const int, int * ) const;
template<>
void MPI_CLASS::call_maxReduce<long long int>(
    const long long int *, long long int *, const int, int * ) const;
template<>
void MPI_CLASS::call_maxReduce<size_t>( const size_t *, size_t *, const int, int * ) const;
template<>
void MPI_CLASS::call_maxReduce<float>( const float *, float *, const int, int * ) const;
template<>
void MPI_CLASS::call_maxReduce<double>( const double *, double *, const int, int * ) const;
#endif
// Default instantiations of call_maxReduce(const TYPE*, TYPE*, const int, int*)
// Fallback for unsupported types: always an error (reached only in parallel).
template<class TYPE>
void MPI_CLASS::call_maxReduce( const TYPE *, TYPE *, const int, int * ) const
{
    char message[200];
    // snprintf: typeid name length is unbounded, avoid buffer overflow
    snprintf( message, sizeof( message ),
        "Default instantiation of maxReduce in parallel is not supported (%s)",
        typeid( TYPE ).name() );
    MPI_CLASS_ERROR( message );
}
/************************************************************************
* bcast *
************************************************************************/
// Define specializations of bcast(TYPE*, const int, const int)
#if defined( USE_MPI ) || defined( USE_EXT_MPI )
template<>
void MPI_CLASS::call_bcast<unsigned char>( unsigned char *, const int, const int ) const;
template<>
void MPI_CLASS::call_bcast<char>( char *, const int, const int ) const;
template<>
void MPI_CLASS::call_bcast<unsigned int>( unsigned int *, const int, const int ) const;
template<>
void MPI_CLASS::call_bcast<int>( int *, const int, const int ) const;
template<>
void MPI_CLASS::call_bcast<float>( float *, const int, const int ) const;
template<>
void MPI_CLASS::call_bcast<double>( double *, const int, const int ) const;
#else
// Serial builds still need the char specialization (used by the default below)
template<>
void MPI_CLASS::call_bcast<char>( char *, const int, const int ) const;
#endif
// Default instantiations of bcast(TYPE*, const int, const int)
// Broadcast any trivially-copyable type by forwarding its raw bytes to the
// char specialization.
template<class TYPE>
void MPI_CLASS::call_bcast( TYPE *x, const int n, const int root ) const
{
    static_assert( is_mpi_copyable<TYPE>(), "Object is not trivially copyable" );
    auto bytes = (char *) x;
    call_bcast<char>( bytes, (int) n * sizeof( TYPE ), root );
}
// Specialization of bcast for std::string
// Broadcasts the string length first, then the character data.
// Uses a std::string as the receive buffer (no raw new/delete): the buffer
// cannot leak if call_bcast throws, and embedded NUL characters are
// preserved instead of truncating the result.
template<>
inline std::string MPI_CLASS::bcast<std::string>( const std::string &value, const int root ) const
{
    if ( comm_size == 1 )
        return value;
    // Broadcast the length so non-root ranks can size their buffers
    int length = static_cast<int>( value.size() );
    call_bcast<int>( &length, 1, root );
    if ( length == 0 )
        return std::string();
    std::string result;
    if ( root == comm_rank )
        result = value;
    else
        result.resize( length );
    // Broadcast the character data in place (length > 0, so &result[0] is valid)
    call_bcast<char>( &result[0], length, root );
    return result;
}
// Broadcasting an array of std::string is not supported: always an error
template<>
inline void MPI_CLASS::bcast<std::string>( std::string *, const int, const int ) const
{
MPI_CLASS_ERROR( "Cannot bcast an array of strings" );
}
// Default implementation of bcast: broadcast a single value from root
template<class TYPE>
inline TYPE MPI_CLASS::bcast( const TYPE &value, const int root ) const
{
    if ( root >= comm_size )
        MPI_CLASS_ERROR( "root cannot be >= size in bcast" );
    // Serial communicator: nothing to broadcast
    if ( comm_size <= 1 )
        return value;
    TYPE result = value;
    call_bcast( &result, 1, root );
    return result;
}
// Default implementation of bcast: broadcast an array of n values from root
template<class TYPE>
inline void MPI_CLASS::bcast( TYPE *x, const int n, const int root ) const
{
    if ( root >= comm_size )
        MPI_CLASS_ERROR( "root cannot be >= size in bcast" );
    // No-op on a serial communicator
    if ( comm_size > 1 )
        call_bcast( x, n, root );
}
/************************************************************************
* send *
************************************************************************/
// Define specializations of send(const TYPE*, const int, const int, int)
#if defined( USE_MPI ) || defined( USE_EXT_MPI )
template<>
void MPI_CLASS::send<char>( const char *, const int, const int, int ) const;
template<>
void MPI_CLASS::send<int>( const int *, int, const int, int ) const;
template<>
void MPI_CLASS::send<float>( const float *, const int, const int, int ) const;
template<>
void MPI_CLASS::send<double>( const double *, const int, const int, int ) const;
#else
// Serial builds still need the char specialization (used by the default below)
template<>
void MPI_CLASS::send<char>( const char *, const int, const int, int ) const;
#endif
// Default instantiations of send(const TYPE*, const int, const int, int)
// Send any trivially-copyable type as raw bytes via the char specialization.
template<class TYPE>
inline void MPI_CLASS::send(
    const TYPE *buf, const int length, const int recv_proc_number, int tag ) const
{
    static_assert( is_mpi_copyable<TYPE>(), "Object is not trivially copyable" );
    auto bytes = (const char *) buf;
    send<char>( bytes, length * sizeof( TYPE ), recv_proc_number, tag );
}
/************************************************************************
* Isend *
************************************************************************/
// Define specializations of Isend(const TYPE*, const int, const int, const int)
#if defined( USE_MPI ) || defined( USE_EXT_MPI )
template<>
MPI_Request MPI_CLASS::Isend<char>( const char *, const int, const int, const int ) const;
template<>
MPI_Request MPI_CLASS::Isend<int>( const int *, int, const int, const int ) const;
template<>
MPI_Request MPI_CLASS::Isend<float>( const float *, const int, const int, const int ) const;
template<>
MPI_Request MPI_CLASS::Isend<double>( const double *, const int, const int, const int ) const;
#else
// Serial builds still need the char specialization (used by the default below)
template<>
MPI_Request MPI_CLASS::Isend<char>( const char *, const int, const int, const int ) const;
#endif
// Default instantiations of Isend(const TYPE*, const int, const int, const int)
// Non-blocking send of any trivially-copyable type as raw bytes.
template<class TYPE>
inline MPI_Request MPI_CLASS::Isend(
    const TYPE *buf, const int length, const int recv_proc_number, const int tag ) const
{
    static_assert( is_mpi_copyable<TYPE>(), "Object is not trivially copyable" );
    auto bytes = (const char *) buf;
    return Isend<char>( bytes, length * sizeof( TYPE ), recv_proc_number, tag );
}
/************************************************************************
* recv *
************************************************************************/
// Define specializations of recv(TYPE*, int&, const int, const bool, int)
#if defined( USE_MPI ) || defined( USE_EXT_MPI )
template<>
void MPI_CLASS::recv<char>( char *, int &, const int, const bool, int ) const;
template<>
void MPI_CLASS::recv<int>( int *, int &, const int, const bool, int ) const;
template<>
void MPI_CLASS::recv<float>( float *, int &, const int, const bool, int ) const;
template<>
void MPI_CLASS::recv<double>( double *, int &, const int, const bool, int ) const;
#else
// Serial builds still need the char specialization (used by the default below)
template<>
void MPI_CLASS::recv<char>( char *, int &, const int, const bool, int ) const;
#endif
// Default instantiations of recv(TYPE*, int&, const int, const bool, int)
// Receive any trivially-copyable type as raw bytes via the char specialization.
template<class TYPE>
inline void MPI_CLASS::recv(
    TYPE *buf, int &length, const int send_proc_number, const bool get_length, int tag ) const
{
    static_assert( is_mpi_copyable<TYPE>(), "Object is not trivially copyable" );
    int n_bytes = length * sizeof( TYPE );
    recv<char>( (char *) buf, n_bytes, send_proc_number, get_length, tag );
    if ( get_length ) {
        // Convert the received byte count back into an element count
        MPI_CLASS_ASSERT( n_bytes % sizeof( TYPE ) == 0 );
        length = n_bytes / sizeof( TYPE );
    }
}
/************************************************************************
* Irecv *
************************************************************************/
// Define specializations of Irecv(TYPE*, const int, const int, const int)
#if defined( USE_MPI ) || defined( USE_EXT_MPI )
template<>
MPI_Request MPI_CLASS::Irecv<char>( char *, const int, const int, const int ) const;
template<>
MPI_Request MPI_CLASS::Irecv<int>( int *, const int, const int, const int ) const;
template<>
MPI_Request MPI_CLASS::Irecv<float>( float *, const int, const int, const int ) const;
template<>
MPI_Request MPI_CLASS::Irecv<double>( double *, const int, const int, const int ) const;
#else
// Serial builds still need the char specialization (used by the default below)
template<>
MPI_Request MPI_CLASS::Irecv<char>( char *, const int, const int, const int ) const;
#endif
// Default instantiations of Irecv(TYPE*, const int, const int, const int)
// Non-blocking receive of any trivially-copyable type as raw bytes.
template<class TYPE>
inline MPI_Request MPI_CLASS::Irecv(
    TYPE *buf, const int length, const int send_proc, const int tag ) const
{
    static_assert( is_mpi_copyable<TYPE>(), "Object is not trivially copyable" );
    auto bytes = (char *) buf;
    return Irecv<char>( bytes, length * sizeof( TYPE ), send_proc, tag );
}
/************************************************************************
* sendrecv *
************************************************************************/
#if defined( USE_MPI ) || defined( USE_EXT_MPI )
// Native specializations for the common types (defined out of line)
template<>
void MPI_CLASS::sendrecv<char>( const char*, int, int, int, char*, int, int, int ) const;
template<>
void MPI_CLASS::sendrecv<int>( const int*, int, int, int, int*, int, int, int ) const;
template<>
void MPI_CLASS::sendrecv<float>( const float*, int, int, int, float*, int, int, int ) const;
template<>
void MPI_CLASS::sendrecv<double>( const double*, int, int, int, double*, int, int, int ) const;
// Generic fallback: only the single-rank (self send/recv) case is supported
template<class TYPE>
void MPI_CLASS::sendrecv( const TYPE *sendbuf, int sendcount, int dest, int sendtag,
TYPE *recvbuf, int recvcount, int source, int recvtag ) const
{
if ( getSize() == 1 ) {
// On one rank a sendrecv is just a local copy; ranks/counts/tags must agree
ASSERT( dest == 0 );
ASSERT( source == 0 );
ASSERT( sendcount == recvcount );
ASSERT( sendtag == recvtag );
memcpy( recvbuf, sendbuf, sendcount * sizeof( TYPE ) );
} else {
ERROR( "Not implimented for " + std::string( typeid( TYPE ).name() ) );
}
}
#else
// Serial (no-MPI) builds: sendrecv can only copy to self
template<class TYPE>
void MPI_CLASS::sendrecv( const TYPE *sendbuf, int sendcount, int dest, int sendtag,
TYPE *recvbuf, int recvcount, int source, int recvtag ) const
{
ASSERT( dest == 0 );
ASSERT( source == 0 );
ASSERT( sendcount == recvcount );
ASSERT( sendtag == recvtag );
memcpy( recvbuf, sendbuf, sendcount * sizeof( TYPE ) );
}
#endif
/************************************************************************
* allGather *
************************************************************************/
// Gather one value from every rank, returned as a vector indexed by rank
template<class TYPE>
std::vector<TYPE> MPI_CLASS::allGather( const TYPE &x ) const
{
    static_assert( is_mpi_copyable<TYPE>(), "Object is not trivially copyable" );
    const int N = getSize();
    // Serial communicator: the result is just the local value
    if ( N <= 1 )
        return std::vector<TYPE>( 1, x );
    std::vector<TYPE> data( N );
    allGather( x, data.data() );
    return data;
}
template<class TYPE>
std::vector<TYPE> MPI_CLASS::allGather( const std::vector<TYPE> &x ) const
{
static_assert( is_mpi_copyable<TYPE>(), "Object is not trivially copyable" );
if ( getSize() <= 1 )
return x;
std::vector<int> count = allGather<int>( x.size() );
std::vector<int> disp( getSize(), 0 );
size_t N = count[0];
for ( size_t i = 1; i < count.size(); i++ ) {
disp[i] = disp[i - 1] + count[i - 1];
N += count[i];
}
std::vector<TYPE> data( N );
allGather<TYPE>( x.data(), x.size(), data.data(), count.data(), disp.data(), true );
return data;
}
// Specialization of MPI_CLASS::allGather for std::string
// Gathers one string from every rank into the caller-provided array x_out
// (x_out must have room for comm_size entries).
template<>
inline void MPI_CLASS::allGather<std::string>( const std::string &x_in, std::string *x_out ) const
{
    // Get the bytes received per processor (including the null terminator)
    std::vector<int> recv_cnt( comm_size, 0 );
    allGather<int>( (int) x_in.size() + 1, &recv_cnt[0] );
    std::vector<int> recv_disp( comm_size, 0 );
    for ( int i = 1; i < comm_size; i++ )
        recv_disp[i] = recv_disp[i - 1] + recv_cnt[i - 1];
    // Call the vector form of allGather for the char arrays.
    // std::vector (not new/delete) so the buffer cannot leak if the gather
    // raises an error.
    std::vector<char> recv_data( recv_disp[comm_size - 1] + recv_cnt[comm_size - 1] );
    allGather<char>(
        x_in.c_str(), (int) x_in.size() + 1, recv_data.data(), &recv_cnt[0], &recv_disp[0], true );
    for ( int i = 0; i < comm_size; i++ )
        x_out[i] = std::string( &recv_data[recv_disp[i]] );
}
// Default instantiation of MPI_CLASS::allGather (one element per rank)
template<class TYPE>
inline void MPI_CLASS::allGather( const TYPE &x_in, TYPE *x_out ) const
{
    static_assert( is_mpi_copyable<TYPE>(), "Object is not trivially copyable" );
    if ( comm_size == 1 ) {
        // Serial: the gather of one element is a simple copy
        x_out[0] = x_in;
        return;
    }
    // Parallel: forward to the type-specific gather
    call_allGather( x_in, x_out );
}
// Specialization of MPI_CLASS::allGather for std::string
// The vector form cannot gather raw string objects, so this overload
// always raises an error.
template<>
inline int MPI_CLASS::allGather<std::string>(
const std::string *, const int, std::string *, int *, int *, bool ) const
{
MPI_CLASS_ERROR( "Cannot allGather an array of strings" );
return 0;
}
// Define specializations of call_allGather(const TYPE, TYPE*)
// (declared here, defined out of line for the native MPI types)
#if defined( USE_MPI ) || defined( USE_EXT_MPI )
template<>
void MPI_CLASS::call_allGather<unsigned char>( const unsigned char &, unsigned char * ) const;
template<>
void MPI_CLASS::call_allGather<char>( const char &, char * ) const;
template<>
void MPI_CLASS::call_allGather<unsigned int>( const unsigned int &, unsigned int * ) const;
template<>
void MPI_CLASS::call_allGather<int>( const int &, int * ) const;
template<>
void MPI_CLASS::call_allGather<unsigned long int>(
const unsigned long int &, unsigned long int * ) const;
template<>
void MPI_CLASS::call_allGather<long int>( const long int &, long int * ) const;
template<>
void MPI_CLASS::call_allGather<float>( const float &, float * ) const;
template<>
void MPI_CLASS::call_allGather<double>( const double &, double * ) const;
#endif
// Default instantiation of MPI_CLASS::allGather
// Gathers send_cnt elements from every rank onto every rank.  If known_recv
// is true the caller supplies the receive counts/displacements; otherwise
// they are computed here (and returned through recv_cnt/recv_disp when
// those pointers are non-null).  Returns the total number of elements
// received.
template<class TYPE>
int MPI_CLASS::allGather( const TYPE *send_data, const int send_cnt, TYPE *recv_data, int *recv_cnt,
    int *recv_disp, bool known_recv ) const
{
    // Check the inputs
    if ( known_recv && ( recv_cnt == nullptr || recv_disp == nullptr ) )
        MPI_CLASS_ERROR( "Error calling allGather" );
    // Check if we are dealing with a single processor
    if ( comm_size == 1 ) {
        if ( send_data == nullptr && send_cnt > 0 ) {
            MPI_CLASS_ERROR( "send_data is null" );
        } else if ( !known_recv ) {
            // We do not know the received sizes
            for ( int i = 0; i < send_cnt; i++ )
                recv_data[i] = send_data[i];
            if ( recv_cnt != nullptr )
                recv_cnt[0] = send_cnt;
            if ( recv_disp != nullptr )
                recv_disp[0] = 0;
        } else {
            // We know the received sizes
            for ( int i = 0; i < send_cnt; i++ )
                recv_data[i + recv_disp[0]] = send_data[i];
        }
        return send_cnt;
    }
    // Get the sizes of the received data (if necessary).
    // std::vector scratch space (not new/delete) so nothing leaks if an
    // error is raised during the gather.
    std::vector<int> tmp_cnt, tmp_disp;
    int *recv_cnt2  = recv_cnt;
    int *recv_disp2 = recv_disp;
    if ( !known_recv ) {
        if ( recv_cnt == nullptr ) {
            tmp_cnt.resize( comm_size );
            recv_cnt2 = tmp_cnt.data();
        }
        if ( recv_disp == nullptr ) {
            tmp_disp.resize( comm_size );
            recv_disp2 = tmp_disp.data();
        }
        call_allGather( send_cnt, recv_cnt2 );
        recv_disp2[0] = 0;
        for ( int i = 1; i < comm_size; i++ )
            recv_disp2[i] = recv_disp2[i - 1] + recv_cnt2[i - 1];
    }
    int N_recv = 0;
    for ( int i = 0; i < comm_size; i++ )
        N_recv += recv_cnt2[i];
    // Send/recv the data
    call_allGather( send_data, send_cnt, recv_data, recv_cnt2, recv_disp2 );
    return N_recv;
}
// Default instantiations of call_allGather(const TYPE, TYPE*)
// Gather the raw bytes of the object using the char specialization
template<class TYPE>
void MPI_CLASS::call_allGather( const TYPE &x_in, TYPE *x_out ) const
{
    static_assert( is_mpi_copyable<TYPE>(), "Object is not trivially copyable" );
    auto src = (const char *) &x_in;
    allGather<char>( src, (int) sizeof( TYPE ), (char *) x_out );
}
// Define specializations of call_allGather(const TYPE*, int, TYPE*, int*, int*)
#if defined( USE_MPI ) || defined( USE_EXT_MPI )
template<>
void MPI_CLASS::call_allGather<unsigned char>(
    const unsigned char *, int, unsigned char *, int *, int * ) const;
template<>
void MPI_CLASS::call_allGather<char>( const char *, int, char *, int *, int * ) const;
template<>
void MPI_CLASS::call_allGather<unsigned int>(
    const unsigned int *, int, unsigned int *, int *, int * ) const;
template<>
void MPI_CLASS::call_allGather<int>( const int *, int, int *, int *, int * ) const;
template<>
void MPI_CLASS::call_allGather<unsigned long int>(
    const unsigned long int *, int, unsigned long int *, int *, int * ) const;
template<>
void MPI_CLASS::call_allGather<long int>( const long int *, int, long int *, int *, int * ) const;
template<>
void MPI_CLASS::call_allGather<float>( const float *, int, float *, int *, int * ) const;
template<>
void MPI_CLASS::call_allGather<double>( const double *, int, double *, int *, int * ) const;
#else
template<>
void MPI_CLASS::call_allGather<char>( const char *, int, char *, int *, int * ) const;
#endif
// Default instantiations of int call_allGather(const TYPE*, int, TYPE*, int*)
// Gather arbitrary trivially-copyable data by converting the element
// counts/displacements to bytes and forwarding to the char specialization.
template<class TYPE>
void MPI_CLASS::call_allGather(
    const TYPE *x_in, int size_in, TYPE *x_out, int *size_out, int *disp_out ) const
{
    static_assert( is_mpi_copyable<TYPE>(), "Object is not trivially copyable" );
    // std::vector scratch space (not new/delete) so nothing leaks if the
    // gather raises an error
    std::vector<int> size2( comm_size ), disp2( comm_size );
    for ( int i = 0; i < comm_size; i++ ) {
        size2[i] = size_out[i] * sizeof( TYPE );
        disp2[i] = disp_out[i] * sizeof( TYPE );
    }
    call_allGather<char>( (const char *) x_in, (int) size_in * sizeof( TYPE ), (char *) x_out,
        size2.data(), disp2.data() );
}
/************************************************************************
* setGather *
************************************************************************/
template<class TYPE>
inline void MPI_CLASS::setGather( std::set<TYPE> &set ) const
{
std::vector<TYPE> send_buf( set.begin(), set.end() );
std::vector<int> recv_cnt( this->comm_size, 0 );
this->allGather<int>( (int) send_buf.size(), &recv_cnt[0] );
std::vector<int> recv_disp( this->comm_size, 0 );
for ( int i = 1; i < this->comm_size; i++ )
recv_disp[i] = recv_disp[i - 1] + recv_cnt[i - 1];
size_t N_recv_tot = 0;
for ( int i = 0; i < this->comm_size; i++ )
N_recv_tot += recv_cnt[i];
if ( N_recv_tot == 0 )
return;
std::vector<TYPE> recv_buf( N_recv_tot );
TYPE *send_data = nullptr;
if ( send_buf.size() > 0 ) {
send_data = &send_buf[0];
}
TYPE *recv_data = &recv_buf[0];
static_assert( is_mpi_copyable<TYPE>(), "Object is not trivially copyable" );
this->allGather<TYPE>(
send_data, (int) send_buf.size(), recv_data, &recv_cnt[0], &recv_disp[0], true );
for ( size_t i = 0; i < recv_buf.size(); i++ )
set.insert( recv_buf[i] );
}
/************************************************************************
* mapGather *
************************************************************************/
// Gather the entries of a std::map from every rank and merge them into the
// local map.  For duplicate keys the entry from the lowest rank wins
// (std::map::insert does not overwrite an existing key).
template<class KEY, class DATA>
inline void MPI_CLASS::mapGather( std::map<KEY, DATA> &map ) const
{
    // Both the keys and the values are shipped as raw bytes, so BOTH types
    // must be MPI-copyable (the original only checked DATA)
    static_assert( is_mpi_copyable<KEY>(), "Object is not trivially copyable" );
    static_assert( is_mpi_copyable<DATA>(), "Object is not trivially copyable" );
    // Flatten the map into parallel key/value arrays
    std::vector<KEY> send_id;
    std::vector<DATA> send_data;
    send_id.reserve( map.size() );
    send_data.reserve( map.size() );
    for ( const auto &entry : map ) {
        send_id.push_back( entry.first );
        send_data.push_back( entry.second );
    }
    // Exchange the per-rank entry counts and build the displacement table
    int send_size = (int) send_id.size();
    std::vector<int> recv_cnt( this->comm_size, 0 );
    this->allGather<int>( send_size, &recv_cnt[0] );
    std::vector<int> recv_disp( this->comm_size, 0 );
    for ( int i = 1; i < this->comm_size; i++ )
        recv_disp[i] = recv_disp[i - 1] + recv_cnt[i - 1];
    size_t N_recv_tot = 0;
    for ( int i = 0; i < this->comm_size; i++ )
        N_recv_tot += recv_cnt[i];
    if ( N_recv_tot == 0 )
        return;
    // Gather the keys and the values from all ranks
    std::vector<KEY> recv_id( N_recv_tot );
    std::vector<DATA> recv_data( N_recv_tot );
    KEY *send_data1 = nullptr;
    DATA *send_data2 = nullptr;
    if ( send_id.size() > 0 ) {
        send_data1 = &send_id[0];
        send_data2 = &send_data[0];
    }
    this->allGather<KEY>( send_data1, send_size, &recv_id[0], &recv_cnt[0], &recv_disp[0], true );
    this->allGather<DATA>(
        send_data2, send_size, &recv_data[0], &recv_cnt[0], &recv_disp[0], true );
    // Rebuild the map from the gathered entries
    map = std::map<KEY, DATA>();
    for ( size_t i = 0; i < N_recv_tot; i++ )
        map.insert( std::pair<KEY, DATA>( recv_id[i], recv_data[i] ) );
}
/************************************************************************
* sumScan *
************************************************************************/
// Sum-scan of x into y (delegates to call_sumScan on a parallel communicator)
template<class TYPE>
inline void MPI_CLASS::sumScan( const TYPE *x, TYPE *y, const int n ) const
{
    if ( comm_size > 1 ) {
        call_sumScan( x, y, n );
        return;
    }
    // Serial: the scan over a single rank is just a copy
    for ( int i = 0; i < n; ++i )
        y[i] = x[i];
}
// Define specializations of call_sumScan(const TYPE*, TYPE*, int)
// (declarations only for the MPI-native types; the definitions live outside
// this header, in the corresponding implementation file)
#if defined( USE_MPI ) || defined( USE_EXT_MPI )
template<>
void MPI_CLASS::call_sumScan<unsigned char>( const unsigned char *, unsigned char *, int ) const;
template<>
void MPI_CLASS::call_sumScan<char>( const char *, char *, int ) const;
template<>
void MPI_CLASS::call_sumScan<unsigned int>( const unsigned int *, unsigned int *, int ) const;
template<>
void MPI_CLASS::call_sumScan<int>( const int *, int *, int ) const;
template<>
void MPI_CLASS::call_sumScan<unsigned long int>(
    const unsigned long int *, unsigned long int *, int ) const;
template<>
void MPI_CLASS::call_sumScan<long int>( const long int *, long int *, int ) const;
template<>
void MPI_CLASS::call_sumScan<size_t>( const size_t *, size_t *, int ) const;
template<>
void MPI_CLASS::call_sumScan<float>( const float *, float *, int ) const;
template<>
void MPI_CLASS::call_sumScan<double>( const double *, double *, int ) const;
template<>
void MPI_CLASS::call_sumScan<std::complex<double>>(
    const std::complex<double> *, std::complex<double> *, int ) const;
#endif
// Default instantiations of call_sumScan(const TYPE*, TYPE*, int)
// Reached only for types with no MPI specialization above: parallel scans of
// arbitrary types are unsupported, so report the offending type and abort.
template<class TYPE>
void MPI_CLASS::call_sumScan( const TYPE *, TYPE *, int ) const
{
    char message[200];
    // snprintf (not sprintf): typeid().name() can be arbitrarily long
    snprintf( message, sizeof( message ),
        "Default instantiation of sumScan in parallel is not supported (%s)",
        typeid( TYPE ).name() );
    MPI_CLASS_ERROR( message );
}
/************************************************************************
* minScan *
************************************************************************/
// Running-minimum scan of n values; a serial communicator contributes only
// its own values, so the result is a plain copy of the input.
template<class TYPE>
inline void MPI_CLASS::minScan( const TYPE *x, TYPE *y, const int n ) const
{
    if ( comm_size <= 1 ) {
        for ( int j = 0; j < n; j++ )
            y[j] = x[j];
        return;
    }
    call_minScan( x, y, n );
}
// Define specializations of call_minScan(const TYPE*, TYPE*, int)
// (declarations only; definitions live outside this header.  Note there is
// no std::complex specialization — complex values have no min ordering.)
#if defined( USE_MPI ) || defined( USE_EXT_MPI )
template<>
void MPI_CLASS::call_minScan<unsigned char>( const unsigned char *, unsigned char *, int ) const;
template<>
void MPI_CLASS::call_minScan<char>( const char *, char *, int ) const;
template<>
void MPI_CLASS::call_minScan<unsigned int>( const unsigned int *, unsigned int *, int ) const;
template<>
void MPI_CLASS::call_minScan<int>( const int *, int *, int ) const;
template<>
void MPI_CLASS::call_minScan<unsigned long int>(
    const unsigned long int *, unsigned long int *, int ) const;
template<>
void MPI_CLASS::call_minScan<long int>( const long int *, long int *, int ) const;
template<>
void MPI_CLASS::call_minScan<size_t>( const size_t *, size_t *, int ) const;
template<>
void MPI_CLASS::call_minScan<float>( const float *, float *, int ) const;
template<>
void MPI_CLASS::call_minScan<double>( const double *, double *, int ) const;
#endif
// Default instantiations of call_minScan(const TYPE*, TYPE*, int)
// Reached only for types with no MPI specialization above: parallel scans of
// arbitrary types are unsupported, so report the offending type and abort.
template<class TYPE>
void MPI_CLASS::call_minScan( const TYPE *, TYPE *, int ) const
{
    char message[200];
    // snprintf (not sprintf): typeid().name() can be arbitrarily long
    snprintf( message, sizeof( message ),
        "Default instantiation of minScan in parallel is not supported (%s)",
        typeid( TYPE ).name() );
    MPI_CLASS_ERROR( message );
}
/************************************************************************
* maxScan *
************************************************************************/
// Running-maximum scan of n values; a serial communicator contributes only
// its own values, so the result is a plain copy of the input.
template<class TYPE>
inline void MPI_CLASS::maxScan( const TYPE *x, TYPE *y, const int n ) const
{
    if ( comm_size <= 1 ) {
        for ( int j = 0; j < n; j++ )
            y[j] = x[j];
        return;
    }
    call_maxScan( x, y, n );
}
// Define specializations of call_maxScan(const TYPE*, TYPE*, int)
// (declarations only; definitions live outside this header.  Note there is
// no std::complex specialization — complex values have no max ordering.)
#if defined( USE_MPI ) || defined( USE_EXT_MPI )
template<>
void MPI_CLASS::call_maxScan<unsigned char>( const unsigned char *, unsigned char *, int ) const;
template<>
void MPI_CLASS::call_maxScan<char>( const char *, char *, int ) const;
template<>
void MPI_CLASS::call_maxScan<unsigned int>( const unsigned int *, unsigned int *, int ) const;
template<>
void MPI_CLASS::call_maxScan<int>( const int *, int *, int ) const;
template<>
void MPI_CLASS::call_maxScan<unsigned long int>(
    const unsigned long int *, unsigned long int *, int ) const;
template<>
void MPI_CLASS::call_maxScan<long int>( const long int *, long int *, int ) const;
template<>
void MPI_CLASS::call_maxScan<size_t>( const size_t *, size_t *, int ) const;
template<>
void MPI_CLASS::call_maxScan<float>( const float *, float *, int ) const;
template<>
void MPI_CLASS::call_maxScan<double>( const double *, double *, int ) const;
#endif
// Default instantiations of call_maxScan(const TYPE*, TYPE*, int)
// Reached only for types with no MPI specialization above: parallel scans of
// arbitrary types are unsupported, so report the offending type and abort.
template<class TYPE>
void MPI_CLASS::call_maxScan( const TYPE *, TYPE *, int ) const
{
    char message[200];
    // snprintf (not sprintf): typeid().name() can be arbitrarily long.
    // Message fixed: it previously said "maxReduce" in the maxScan handler.
    snprintf( message, sizeof( message ),
        "Default instantiation of maxScan in parallel is not supported (%s)",
        typeid( TYPE ).name() );
    MPI_CLASS_ERROR( message );
}
/************************************************************************
* allToAll *
************************************************************************/
// Define specializations of allToAll(const int n, const char*, char* )
// (declarations only for the MPI-native types; definitions live outside
// this header)
#if defined( USE_MPI ) || defined( USE_EXT_MPI )
template<>
void MPI_CLASS::allToAll<unsigned char>(
    const int n, const unsigned char *, unsigned char * ) const;
template<>
void MPI_CLASS::allToAll<char>( const int n, const char *, char * ) const;
template<>
void MPI_CLASS::allToAll<unsigned int>( const int n, const unsigned int *, unsigned int * ) const;
template<>
void MPI_CLASS::allToAll<int>( const int n, const int *, int * ) const;
template<>
void MPI_CLASS::allToAll<unsigned long int>(
    const int n, const unsigned long int *, unsigned long int * ) const;
template<>
void MPI_CLASS::allToAll<long int>( const int n, const long int *, long int * ) const;
template<>
void MPI_CLASS::allToAll<float>( const int n, const float *, float * ) const;
template<>
void MPI_CLASS::allToAll<double>( const int n, const double *, double * ) const;
#endif
// Default instantiations of allToAll(const int n, const char*, char* )
#if defined( USE_MPI ) || defined( USE_EXT_MPI )
// With MPI enabled: forward any trivially-copyable type to the char
// specialization by reinterpreting the buffers as raw bytes
template<class TYPE>
void MPI_CLASS::allToAll( const int n, const TYPE *send_data, TYPE *recv_data ) const
{
    static_assert( is_mpi_copyable<TYPE>(), "Object is not trivially copyable" );
    allToAll<char>( n * sizeof( TYPE ), (char *) send_data, (char *) recv_data );
}
#else
// Without MPI: only a self-to-self copy on a single-rank communicator is valid
template<class TYPE>
void MPI_CLASS::allToAll( const int n, const TYPE *send_data, TYPE *recv_data ) const
{
    if ( comm_size != 1 )
        MPI_CLASS_ERROR( "Invalid size for allToAll" );
    for ( int i = 0; i < n; i++ )
        recv_data[i] = send_data[i];
}
#endif
/************************************************************************
* allToAll *
************************************************************************/
// Variable-count all-to-all exchange.
//   send_data/send_cnt/send_disp - data, per-rank counts, per-rank offsets to send
//   recv_data                    - output buffer
//   recv_cnt/recv_disp           - per-rank receive counts/offsets; inputs when
//                                  known_recv is true, optional outputs otherwise
// Returns the total number of elements received.
template<class TYPE>
int MPI_CLASS::allToAll( const TYPE *send_data, const int send_cnt[], const int send_disp[],
    TYPE *recv_data, int *recv_cnt, int *recv_disp, bool known_recv ) const
{
    int N_received = 0;
    if ( comm_size == 1 ) {
        // Special case for single-processor communicators
        if ( known_recv ) {
            if ( recv_cnt[0] != send_cnt[0] && send_cnt[0] > 0 )
                MPI_CLASS_ERROR( "Single processor send/recv are different sizes" );
        } else {
            if ( recv_cnt != nullptr )
                recv_cnt[0] = send_cnt[0];
            if ( recv_disp != nullptr )
                recv_disp[0] = send_disp[0];
        }
        for ( int i = 0; i < send_cnt[0]; i++ )
            recv_data[i + recv_disp[0]] = send_data[i + send_disp[0]];
        N_received = send_cnt[0];
    } else if ( known_recv ) {
        // The receive sizes are known
        MPI_CLASS_ASSERT( recv_cnt != nullptr && recv_disp != nullptr );
        call_allToAll( send_data, send_cnt, send_disp, recv_data, recv_cnt, recv_disp );
        for ( int i = 0; i < comm_size; i++ )
            N_received += recv_cnt[i];
    } else {
        // The receive sizes are not known, we need to communicate that information first.
        // Use RAII vectors for the scratch buffers (the original used new[]/delete[],
        // which leaked if the communication call threw)
        std::vector<int> tmp_cnt, tmp_disp;
        int *recv_cnt2  = recv_cnt;
        int *recv_disp2 = recv_disp;
        if ( recv_cnt == nullptr ) {
            tmp_cnt.resize( comm_size );
            recv_cnt2 = &tmp_cnt[0];
        }
        if ( recv_disp == nullptr ) {
            tmp_disp.resize( comm_size );
            recv_disp2 = &tmp_disp[0];
        }
        // Communicate the size we will be receiving from each processor
        allToAll<int>( 1, send_cnt, recv_cnt2 );
        recv_disp2[0] = 0;
        for ( int i = 1; i < comm_size; i++ )
            recv_disp2[i] = recv_disp2[i - 1] + recv_cnt2[i - 1];
        // Send the data
        call_allToAll( send_data, send_cnt, send_disp, recv_data, recv_cnt2, recv_disp2 );
        for ( int i = 0; i < comm_size; i++ )
            N_received += recv_cnt2[i];
    }
    return N_received;
}
// Define specializations of call_allToAll
// (declarations only; definitions live outside this header.  The char
// specialization is also declared for serial builds since the generic
// fallback below funnels everything through it.)
#if defined( USE_MPI ) || defined( USE_EXT_MPI )
template<>
void MPI_CLASS::call_allToAll<unsigned char>( const unsigned char *, const int *, const int *,
    unsigned char *, const int *, const int * ) const;
template<>
void MPI_CLASS::call_allToAll<char>(
    const char *, const int *, const int *, char *, const int *, const int * ) const;
template<>
void MPI_CLASS::call_allToAll<unsigned int>( const unsigned int *, const int *, const int *,
    unsigned int *, const int *, const int * ) const;
template<>
void MPI_CLASS::call_allToAll<int>(
    const int *, const int *, const int *, int *, const int *, const int * ) const;
template<>
void MPI_CLASS::call_allToAll<unsigned long int>( const unsigned long int *, const int *,
    const int *, unsigned long int *, const int *, const int * ) const;
template<>
void MPI_CLASS::call_allToAll<long int>(
    const long int *, const int *, const int *, long int *, const int *, const int * ) const;
template<>
void MPI_CLASS::call_allToAll<float>(
    const float *, const int *, const int *, float *, const int *, const int * ) const;
template<>
void MPI_CLASS::call_allToAll<double>(
    const double *, const int *, const int *, double *, const int *, const int * ) const;
#else
template<>
void MPI_CLASS::call_allToAll<char>(
    const char *, const int *, const int *, char *, const int *, const int * ) const;
#endif
// Default instantiations of call_allToAll
// Generic fallback: convert the element counts/offsets to byte counts/offsets
// and forward the exchange to the char specialization.
template<class TYPE>
void MPI_CLASS::call_allToAll( const TYPE *send_data, const int send_cnt[], const int send_disp[],
    TYPE *recv_data, const int *recv_cnt, const int *recv_disp ) const
{
    // Check trivial copyability before doing any work
    static_assert( is_mpi_copyable<TYPE>(), "Object is not trivially copyable" );
    // RAII vectors for the byte-count scratch arrays (the original used
    // new[]/delete[], which leaked if the communication call threw)
    std::vector<int> send_cnt2( comm_size ), recv_cnt2( comm_size );
    std::vector<int> send_disp2( comm_size ), recv_disp2( comm_size );
    for ( int i = 0; i < comm_size; i++ ) {
        send_cnt2[i]  = send_cnt[i] * sizeof( TYPE );
        send_disp2[i] = send_disp[i] * sizeof( TYPE );
        recv_cnt2[i]  = recv_cnt[i] * sizeof( TYPE );
        recv_disp2[i] = recv_disp[i] * sizeof( TYPE );
    }
    call_allToAll<char>( (char *) send_data, &send_cnt2[0], &send_disp2[0], (char *) recv_data,
        &recv_cnt2[0], &recv_disp2[0] );
}
} // namespace Utilities
#endif