// This file contains the default instantiations for templated operations
// Note: Intel compilers need definitions before all default instantiations to compile correctly
#ifndef included_MPI_I
#define included_MPI_I
#include "common/Utilities.h"
#include <typeinfo>
#define MPI_CLASS MPI
#define MPI_CLASS_ERROR ERROR
#define MPI_CLASS_ASSERT ASSERT
#undef NULL_USE
#define NULL_USE(variable) \
do { \
if (0) { \
auto static t = (char *)&variable; \
t++; \
} \
} while (0)
namespace Utilities {
// Trait to test if a type is a std::pair
template <typename> struct is_pair : std::false_type {};
template <typename T, typename U>
struct is_pair<std::pair<T, U>> : std::true_type {};
// Function to test if a type can be passed by MPI
template <class TYPE>
constexpr
typename std::enable_if<std::is_trivially_copyable<TYPE>::value, bool>::type
is_mpi_copyable() {
return true;
}
template <class TYPE>
constexpr typename std::enable_if<!std::is_trivially_copyable<TYPE>::value &&
is_pair<TYPE>::value,
bool>::type
is_mpi_copyable() {
return is_mpi_copyable<typename TYPE::first_type>() &&
is_mpi_copyable<typename TYPE::second_type>();
}
template <class TYPE>
constexpr typename std::enable_if<!std::is_trivially_copyable<TYPE>::value &&
!is_pair<TYPE>::value,
bool>::type
is_mpi_copyable() {
return false;
}
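// Compile-time usage sketch for is_mpi_copyable (illustrative only; the results
// assume common standard-library behavior, e.g. std::string not being trivially
// copyable):
//   static_assert(is_mpi_copyable<double>(), "trivially copyable");
//   static_assert(is_mpi_copyable<std::pair<int, double>>(), "pair of copyable types");
//   static_assert(!is_mpi_copyable<std::string>(), "cannot be sent byte-wise");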
/************************************************************************
* sumReduce *
************************************************************************/
template <class TYPE> inline TYPE MPI_CLASS::sumReduce(const TYPE value) const {
if (comm_size > 1) {
TYPE tmp = value;
call_sumReduce(&tmp, 1);
return tmp;
} else {
return value;
}
}
template <class TYPE> inline void MPI_CLASS::sumReduce(TYPE *x, int n) const {
if (comm_size > 1)
call_sumReduce(x, n);
}
template <class TYPE>
inline void MPI_CLASS::sumReduce(const TYPE *x, TYPE *y, int n) const {
if (comm_size > 1) {
call_sumReduce(x, y, n);
} else {
for (int i = 0; i < n; i++)
y[i] = x[i];
}
}
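// Usage sketch for sumReduce (illustrative only): `comm` is assumed to be a
// communicator object of this MPI class; the values are hypothetical.
//   double local_mass = 1.5;                          // per-rank contribution
//   double total_mass = comm.sumReduce(local_mass);   // scalar form
//   double x[3] = {1.0, 2.0, 3.0};
//   comm.sumReduce(x, 3);                             // in-place array form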
// Define specializations of call_sumReduce(TYPE*, int)
#if defined(USE_MPI) || defined(USE_EXT_MPI)
template <>
void MPI_CLASS::call_sumReduce<unsigned char>(unsigned char *, int) const;
template <> void MPI_CLASS::call_sumReduce<char>(char *, int) const;
template <>
void MPI_CLASS::call_sumReduce<unsigned int>(unsigned int *, int) const;
template <> void MPI_CLASS::call_sumReduce<int>(int *, int) const;
template <>
void MPI_CLASS::call_sumReduce<unsigned long int>(unsigned long int *,
int) const;
template <> void MPI_CLASS::call_sumReduce<long int>(long int *, int) const;
template <> void MPI_CLASS::call_sumReduce<size_t>(size_t *, int) const;
template <> void MPI_CLASS::call_sumReduce<float>(float *, int) const;
template <> void MPI_CLASS::call_sumReduce<double>(double *, int) const;
template <>
void MPI_CLASS::call_sumReduce<std::complex<double>>(std::complex<double> *,
int) const;
#endif
// Default instantiations of call_sumReduce(TYPE*, int)
template <class TYPE> void MPI_CLASS::call_sumReduce(TYPE *, int) const {
char message[200];
sprintf(message,
"Default instantion of sumReduce in parallel is not supported (%s)",
typeid(TYPE).name());
MPI_CLASS_ERROR(message);
}
// Define specializations of call_sumReduce(const TYPE*, TYPE*, int)
#if defined(USE_MPI) || defined(USE_EXT_MPI)
template <>
void MPI_CLASS::call_sumReduce<unsigned char>(const unsigned char *,
unsigned char *, int) const;
template <>
void MPI_CLASS::call_sumReduce<char>(const char *, char *, int) const;
template <>
void MPI_CLASS::call_sumReduce<unsigned int>(const unsigned int *,
unsigned int *, int) const;
template <> void MPI_CLASS::call_sumReduce<int>(const int *, int *, int) const;
template <>
void MPI_CLASS::call_sumReduce<unsigned long int>(const unsigned long int *,
unsigned long int *,
int) const;
template <>
void MPI_CLASS::call_sumReduce<long int>(const long int *, long int *,
int) const;
template <>
void MPI_CLASS::call_sumReduce<size_t>(const size_t *, size_t *, int) const;
template <>
void MPI_CLASS::call_sumReduce<float>(const float *, float *, int) const;
template <>
void MPI_CLASS::call_sumReduce<double>(const double *, double *, int) const;
template <>
void MPI_CLASS::call_sumReduce<std::complex<double>>(
const std::complex<double> *, std::complex<double> *, int) const;
#endif
// Default instantiations of call_sumReduce(const TYPE*, TYPE*, int)
template <class TYPE>
void MPI_CLASS::call_sumReduce(const TYPE *x, TYPE *y, int n) const {
NULL_USE(x);
NULL_USE(y);
NULL_USE(n);
char message[200];
sprintf(message,
"Default instantion of sumReduce in parallel is not supported (%s)",
typeid(TYPE).name());
MPI_CLASS_ERROR(message);
}
/************************************************************************
* minReduce *
************************************************************************/
template <class TYPE> inline TYPE MPI_CLASS::minReduce(const TYPE value) const {
if (comm_size > 1) {
TYPE tmp = value;
call_minReduce(&tmp, 1, nullptr);
return tmp;
} else {
return value;
}
}
template <class TYPE>
inline void MPI_CLASS::minReduce(TYPE *x, int n, int *rank_of_min) const {
if (comm_size > 1) {
call_minReduce(x, n, rank_of_min);
} else {
if (rank_of_min != nullptr) {
for (int i = 0; i < n; i++)
rank_of_min[i] = 0;
}
}
}
template <class TYPE>
inline void MPI_CLASS::minReduce(const TYPE *x, TYPE *y, int n,
int *rank_of_min) const {
if (comm_size > 1) {
call_minReduce(x, y, n, rank_of_min);
} else {
for (int i = 0; i < n; i++) {
y[i] = x[i];
if (rank_of_min != nullptr)
rank_of_min[i] = 0;
}
}
}
// Define specializations of call_minReduce(TYPE*, int, int*)
#if defined(USE_MPI) || defined(USE_EXT_MPI)
template <>
void MPI_CLASS::call_minReduce<unsigned char>(unsigned char *, int,
int *) const;
template <> void MPI_CLASS::call_minReduce<char>(char *, int, int *) const;
template <>
void MPI_CLASS::call_minReduce<unsigned int>(unsigned int *, int, int *) const;
template <> void MPI_CLASS::call_minReduce<int>(int *, int, int *) const;
template <>
void MPI_CLASS::call_minReduce<unsigned long int>(unsigned long int *, int,
int *) const;
template <>
void MPI_CLASS::call_minReduce<long int>(long int *, int, int *) const;
template <>
void MPI_CLASS::call_minReduce<unsigned long long int>(unsigned long long int *,
int, int *) const;
template <>
void MPI_CLASS::call_minReduce<long long int>(long long int *, int,
int *) const;
template <> void MPI_CLASS::call_minReduce<size_t>(size_t *, int, int *) const;
template <> void MPI_CLASS::call_minReduce<float>(float *, int, int *) const;
template <> void MPI_CLASS::call_minReduce<double>(double *, int, int *) const;
#endif
// Default instantiations of call_minReduce(TYPE*, int, int*)
template <class TYPE> void MPI_CLASS::call_minReduce(TYPE *, int, int *) const {
char message[200];
sprintf(message,
"Default instantion of minReduce in parallel is not supported (%s)",
typeid(TYPE).name());
MPI_CLASS_ERROR(message);
}
// Define specializations of call_minReduce(const TYPE*, TYPE*, int, int*)
#if defined(USE_MPI) || defined(USE_EXT_MPI)
template <>
void MPI_CLASS::call_minReduce<unsigned char>(const unsigned char *,
unsigned char *, int,
int *) const;
template <>
void MPI_CLASS::call_minReduce<char>(const char *, char *, int, int *) const;
template <>
void MPI_CLASS::call_minReduce<unsigned int>(const unsigned int *,
unsigned int *, int, int *) const;
template <>
void MPI_CLASS::call_minReduce<int>(const int *, int *, int, int *) const;
template <>
void MPI_CLASS::call_minReduce<unsigned long int>(const unsigned long int *,
unsigned long int *, int,
int *) const;
template <>
void MPI_CLASS::call_minReduce<long int>(const long int *, long int *, int,
int *) const;
template <>
void MPI_CLASS::call_minReduce<unsigned long long int>(
const unsigned long long int *, unsigned long long int *, int, int *) const;
template <>
void MPI_CLASS::call_minReduce<long long int>(const long long int *,
long long int *, int,
int *) const;
template <>
void MPI_CLASS::call_minReduce<size_t>(const size_t *, size_t *, int,
int *) const;
template <>
void MPI_CLASS::call_minReduce<float>(const float *, float *, int, int *) const;
template <>
void MPI_CLASS::call_minReduce<double>(const double *, double *, int,
int *) const;
#endif
// Default instantiations of call_minReduce(const TYPE*, TYPE*, int, int*)
template <class TYPE>
void MPI_CLASS::call_minReduce(const TYPE *, TYPE *, int, int *) const {
char message[200];
sprintf(message,
"Default instantion of minReduce in parallel is not supported (%s)",
typeid(TYPE).name());
MPI_CLASS_ERROR(message);
}
/************************************************************************
* maxReduce *
************************************************************************/
template <class TYPE> inline TYPE MPI_CLASS::maxReduce(const TYPE value) const {
if (comm_size > 1) {
TYPE tmp = value;
call_maxReduce(&tmp, 1, nullptr);
return tmp;
} else {
return value;
}
}
template <class TYPE>
inline void MPI_CLASS::maxReduce(TYPE *x, int n, int *rank_of_max) const {
if (comm_size > 1) {
call_maxReduce(x, n, rank_of_max);
} else {
if (rank_of_max != nullptr) {
for (int i = 0; i < n; i++)
rank_of_max[i] = 0;
}
}
}
template <class TYPE>
inline void MPI_CLASS::maxReduce(const TYPE *x, TYPE *y, int n,
int *rank_of_max) const {
if (comm_size > 1) {
call_maxReduce(x, y, n, rank_of_max);
} else {
for (int i = 0; i < n; i++) {
y[i] = x[i];
if (rank_of_max != nullptr)
rank_of_max[i] = 0;
}
}
}
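// Usage sketch for minReduce/maxReduce (illustrative only; `comm` is an assumed
// communicator object of this class):
//   double local_dt = 0.01;                        // hypothetical per-rank value
//   double global_dt = comm.minReduce(local_dt);   // smallest value over all ranks
//   // Array form that also reports which rank held each maximum:
//   double err[2] = {1.0e-3, 2.0e-4};
//   int rank_of_max[2];
//   comm.maxReduce(err, 2, rank_of_max);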
// Define specializations of call_maxReduce(TYPE*, int, int*)
#if defined(USE_MPI) || defined(USE_EXT_MPI)
template <>
void MPI_CLASS::call_maxReduce<unsigned char>(unsigned char *, int,
int *) const;
template <> void MPI_CLASS::call_maxReduce<char>(char *, int, int *) const;
template <>
void MPI_CLASS::call_maxReduce<unsigned int>(unsigned int *, int, int *) const;
template <> void MPI_CLASS::call_maxReduce<int>(int *, int, int *) const;
template <>
void MPI_CLASS::call_maxReduce<unsigned long int>(unsigned long int *, int,
int *) const;
template <>
void MPI_CLASS::call_maxReduce<long int>(long int *, int, int *) const;
template <>
void MPI_CLASS::call_maxReduce<unsigned long long int>(unsigned long long int *,
int, int *) const;
template <>
void MPI_CLASS::call_maxReduce<long long int>(long long int *, int,
int *) const;
template <> void MPI_CLASS::call_maxReduce<size_t>(size_t *, int, int *) const;
template <> void MPI_CLASS::call_maxReduce<float>(float *, int, int *) const;
template <> void MPI_CLASS::call_maxReduce<double>(double *, int, int *) const;
#endif
// Default instantiations of call_maxReduce(TYPE*, int, int*)
template <class TYPE> void MPI_CLASS::call_maxReduce(TYPE *, int, int *) const {
char message[200];
sprintf(message,
"Default instantion of maxReduce in parallel is not supported (%s)",
typeid(TYPE).name());
MPI_CLASS_ERROR(message);
}
// Define specializations of call_maxReduce(const TYPE*, TYPE*, int, int*)
#if defined(USE_MPI) || defined(USE_EXT_MPI)
template <>
void MPI_CLASS::call_maxReduce<unsigned char>(const unsigned char *,
unsigned char *, int,
int *) const;
template <>
void MPI_CLASS::call_maxReduce<char>(const char *, char *, int, int *) const;
template <>
void MPI_CLASS::call_maxReduce<unsigned int>(const unsigned int *,
unsigned int *, int, int *) const;
template <>
void MPI_CLASS::call_maxReduce<int>(const int *, int *, int, int *) const;
template <>
void MPI_CLASS::call_maxReduce<unsigned long int>(const unsigned long int *,
unsigned long int *, int,
int *) const;
template <>
void MPI_CLASS::call_maxReduce<long int>(const long int *, long int *, int,
int *) const;
template <>
void MPI_CLASS::call_maxReduce<unsigned long long int>(
const unsigned long long int *, unsigned long long int *, int, int *) const;
template <>
void MPI_CLASS::call_maxReduce<long long int>(const long long int *,
long long int *, int,
int *) const;
template <>
void MPI_CLASS::call_maxReduce<size_t>(const size_t *, size_t *, int,
int *) const;
template <>
void MPI_CLASS::call_maxReduce<float>(const float *, float *, int, int *) const;
template <>
void MPI_CLASS::call_maxReduce<double>(const double *, double *, int,
int *) const;
#endif
// Default instantiations of call_maxReduce(const TYPE*, TYPE*, int, int*)
template <class TYPE>
void MPI_CLASS::call_maxReduce(const TYPE *, TYPE *, int, int *) const {
char message[200];
sprintf(message,
"Default instantion of maxReduce in parallel is not supported (%s)",
typeid(TYPE).name());
MPI_CLASS_ERROR(message);
}
/************************************************************************
* bcast *
************************************************************************/
// Define specializations of call_bcast(TYPE*, int, int)
#if defined(USE_MPI) || defined(USE_EXT_MPI)
template <>
void MPI_CLASS::call_bcast<unsigned char>(unsigned char *, int, int) const;
template <> void MPI_CLASS::call_bcast<char>(char *, int, int) const;
template <>
void MPI_CLASS::call_bcast<unsigned int>(unsigned int *, int, int) const;
template <> void MPI_CLASS::call_bcast<int>(int *, int, int) const;
template <> void MPI_CLASS::call_bcast<float>(float *, int, int) const;
template <> void MPI_CLASS::call_bcast<double>(double *, int, int) const;
#else
template <> void MPI_CLASS::call_bcast<char>(char *, int, int) const;
#endif
// Default instantiations of call_bcast(TYPE*, int, int)
template <class TYPE>
void MPI_CLASS::call_bcast(TYPE *x, int n, int root) const {
static_assert(is_mpi_copyable<TYPE>(), "Object is not trivially copyable");
call_bcast<char>((char *)x, (int)n * sizeof(TYPE), root);
}
// Specialization of bcast for std::string
template <>
inline std::string MPI_CLASS::bcast<std::string>(const std::string &value,
int root) const {
if (comm_size == 1)
return value;
int length = static_cast<int>(value.size());
call_bcast<int>(&length, 1, root);
if (length == 0)
return std::string();
char *str = new char[length + 1];
if (root == comm_rank) {
for (int i = 0; i < length; i++)
str[i] = value[i];
}
call_bcast<char>(str, length, root);
str[length] = 0;
std::string result(str);
delete[] str;
return result;
}
template <>
inline void MPI_CLASS::bcast<std::string>(std::string *, int, int) const {
MPI_CLASS_ERROR("Cannot bcast an array of strings");
}
// Default implementation of bcast
template <class TYPE>
inline TYPE MPI_CLASS::bcast(const TYPE &value, int root) const {
if (root >= comm_size)
MPI_CLASS_ERROR("root cannot be >= size in bcast");
if (comm_size > 1) {
TYPE tmp = value;
call_bcast(&tmp, 1, root);
return tmp;
} else {
return value;
}
}
template <class TYPE>
inline void MPI_CLASS::bcast(TYPE *x, int n, int root) const {
if (root >= comm_size)
MPI_CLASS_ERROR("root cannot be >= size in bcast");
if (comm_size > 1)
call_bcast(x, n, root);
}
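// Usage sketch for bcast (illustrative only; `comm` assumed, root rank 0):
//   int n = 0;                          // filled on the root rank (details omitted)
//   n = comm.bcast(n, 0);               // by-value form
//   std::vector<double> data(n);
//   comm.bcast(data.data(), n, 0);      // array form
//   std::string name = comm.bcast(std::string("input.db"), 0);  // string specialization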
/************************************************************************
* send *
************************************************************************/
// Define specializations of send(const TYPE*, int, int, int)
#if defined(USE_MPI) || defined(USE_EXT_MPI)
template <> void MPI_CLASS::send<char>(const char *, int, int, int) const;
template <> void MPI_CLASS::send<int>(const int *, int, int, int) const;
template <> void MPI_CLASS::send<float>(const float *, int, int, int) const;
template <> void MPI_CLASS::send<double>(const double *, int, int, int) const;
#else
template <> void MPI_CLASS::send<char>(const char *, int, int, int) const;
#endif
// Default instantiations of send(const TYPE*, int, int, int)
template <class TYPE>
inline void MPI_CLASS::send(const TYPE *buf, int length, int recv_proc_number,
int tag) const {
static_assert(is_mpi_copyable<TYPE>(), "Object is not trivially copyable");
send<char>((const char *)buf, length * sizeof(TYPE), recv_proc_number, tag);
}
/************************************************************************
* Isend *
************************************************************************/
// Define specializations of Isend(const TYPE*, int, int, int)
#if defined(USE_MPI) || defined(USE_EXT_MPI)
template <>
MPI_Request MPI_CLASS::Isend<char>(const char *, int, int, int) const;
template <> MPI_Request MPI_CLASS::Isend<int>(const int *, int, int, int) const;
template <>
MPI_Request MPI_CLASS::Isend<float>(const float *, int, int, int) const;
template <>
MPI_Request MPI_CLASS::Isend<double>(const double *, int, int, int) const;
#else
template <>
MPI_Request MPI_CLASS::Isend<char>(const char *, int, int, int) const;
#endif
// Default instantiations of Isend(const TYPE*, int, int, int)
template <class TYPE>
inline MPI_Request MPI_CLASS::Isend(const TYPE *buf, int length,
int recv_proc_number, int tag) const {
static_assert(is_mpi_copyable<TYPE>(), "Object is not trivially copyable");
return Isend<char>((const char *)buf, length * sizeof(TYPE),
recv_proc_number, tag);
}
/************************************************************************
* recv *
************************************************************************/
// Define specializations of recv(TYPE*, int&, int, const bool, int)
#if defined(USE_MPI) || defined(USE_EXT_MPI)
template <>
void MPI_CLASS::recv<char>(char *, int &, int, const bool, int) const;
template <> void MPI_CLASS::recv<int>(int *, int &, int, const bool, int) const;
template <>
void MPI_CLASS::recv<float>(float *, int &, int, const bool, int) const;
template <>
void MPI_CLASS::recv<double>(double *, int &, int, const bool, int) const;
#else
template <>
void MPI_CLASS::recv<char>(char *, int &, int, const bool, int) const;
#endif
// Default instantiations of recv(TYPE*, int&, int, const bool, int)
template <class TYPE>
inline void MPI_CLASS::recv(TYPE *buf, int &length, int send_proc_number,
const bool get_length, int tag) const {
static_assert(is_mpi_copyable<TYPE>(), "Object is not trivially copyable");
int size = length * sizeof(TYPE);
recv<char>((char *)buf, size, send_proc_number, get_length, tag);
if (get_length) {
MPI_CLASS_ASSERT(size % sizeof(TYPE) == 0);
length = size / sizeof(TYPE);
}
}
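// Usage sketch for the blocking send/recv pair (illustrative only; `comm`
// assumed; ranks and tag are hypothetical):
//   const int tag = 17;
//   double buf[8];
//   if (/* this rank is the sender */) {
//       comm.send(buf, 8, /*recv_proc=*/1, tag);
//   } else {
//       int length = 8;
//       comm.recv(buf, length, /*send_proc=*/0, /*get_length=*/false, tag);
//   }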
/************************************************************************
* Irecv *
************************************************************************/
// Define specializations of Irecv(TYPE*, int, int, int)
#if defined(USE_MPI) || defined(USE_EXT_MPI)
template <> MPI_Request MPI_CLASS::Irecv<char>(char *, int, int, int) const;
template <> MPI_Request MPI_CLASS::Irecv<int>(int *, int, int, int) const;
template <> MPI_Request MPI_CLASS::Irecv<float>(float *, int, int, int) const;
template <> MPI_Request MPI_CLASS::Irecv<double>(double *, int, int, int) const;
#else
template <> MPI_Request MPI_CLASS::Irecv<char>(char *, int, int, int) const;
#endif
// Default instantiations of Irecv(TYPE*, int, int, int)
template <class TYPE>
inline MPI_Request MPI_CLASS::Irecv(TYPE *buf, int length, int send_proc,
int tag) const {
static_assert(is_mpi_copyable<TYPE>(), "Object is not trivially copyable");
return Irecv<char>((char *)buf, length * sizeof(TYPE), send_proc, tag);
}
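// Usage sketch for non-blocking Isend/Irecv (illustrative only; `comm` assumed).
// The returned MPI_Request handles must be completed (via this class's wait
// routines, declared elsewhere) before the buffers are reused.
//   double out[4], in[4];
//   MPI_Request reqs[2];
//   reqs[0] = comm.Irecv(in, 4, /*send_proc=*/1, /*tag=*/3);
//   reqs[1] = comm.Isend(out, 4, /*recv_proc=*/1, /*tag=*/3);
//   // ... wait on both requests before touching `in` or reusing `out`.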
/************************************************************************
* sendrecv *
************************************************************************/
#if defined(USE_MPI) || defined(USE_EXT_MPI)
template <>
void MPI_CLASS::sendrecv<char>(const char *, int, int, int, char *, int, int,
int) const;
template <>
void MPI_CLASS::sendrecv<int>(const int *, int, int, int, int *, int, int,
int) const;
template <>
void MPI_CLASS::sendrecv<float>(const float *, int, int, int, float *, int, int,
int) const;
template <>
void MPI_CLASS::sendrecv<double>(const double *, int, int, int, double *, int,
int, int) const;
template <class TYPE>
void MPI_CLASS::sendrecv(const TYPE *sendbuf, int sendcount, int dest,
int sendtag, TYPE *recvbuf, int recvcount, int source,
int recvtag) const {
if (getSize() == 1) {
ASSERT(dest == 0);
ASSERT(source == 0);
ASSERT(sendcount == recvcount);
ASSERT(sendtag == recvtag);
memcpy(recvbuf, sendbuf, sendcount * sizeof(TYPE));
} else {
ERROR("Not implimented for " + std::string(typeid(TYPE).name()));
}
}
#else
template <class TYPE>
void MPI_CLASS::sendrecv(const TYPE *sendbuf, int sendcount, int dest,
int sendtag, TYPE *recvbuf, int recvcount, int source,
int recvtag) const {
ASSERT(dest == 0);
ASSERT(source == 0);
ASSERT(sendcount == recvcount);
ASSERT(sendtag == recvtag);
memcpy(recvbuf, sendbuf, sendcount * sizeof(TYPE));
}
#endif
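// Usage sketch for sendrecv (illustrative only; `comm` assumed): exchange a
// fixed-size block with neighbor ranks in a single call; `left` and `right` are
// hypothetical neighbor ranks.
//   double send_buf[16], recv_buf[16];
//   comm.sendrecv(send_buf, 16, /*dest=*/right, /*sendtag=*/0,
//                 recv_buf, 16, /*source=*/left, /*recvtag=*/0);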
/************************************************************************
* allGather *
************************************************************************/
template <class TYPE>
std::vector<TYPE> MPI_CLASS::allGather(const TYPE &x) const {
static_assert(is_mpi_copyable<TYPE>(), "Object is not trivially copyable");
if (getSize() <= 1)
return std::vector<TYPE>(1, x);
std::vector<TYPE> data(getSize());
allGather(x, data.data());
return data;
}
template <class TYPE>
std::vector<TYPE> MPI_CLASS::allGather(const std::vector<TYPE> &x) const {
static_assert(is_mpi_copyable<TYPE>(), "Object is not trivially copyable");
if (getSize() <= 1)
return x;
std::vector<int> count = allGather<int>(x.size());
std::vector<int> disp(getSize(), 0);
size_t N = count[0];
for (size_t i = 1; i < count.size(); i++) {
disp[i] = disp[i - 1] + count[i - 1];
N += count[i];
}
std::vector<TYPE> data(N);
allGather<TYPE>(x.data(), x.size(), data.data(), count.data(), disp.data(),
true);
return data;
}
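// Usage sketch for allGather (illustrative only; `comm`, `local_count`, and
// `local_values` are assumed caller-side names):
//   // Scalar form: every rank contributes one value, all ranks get the full list.
//   std::vector<int> counts = comm.allGather<int>(local_count);
//   // Vector form: per-rank contributions may differ in length.
//   std::vector<double> all = comm.allGather(local_values);  // local_values: std::vector<double>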
// Specialization of MPI_CLASS::allGather for std::string
template <>
inline void MPI_CLASS::allGather<std::string>(const std::string &x_in,
std::string *x_out) const {
// Get the number of bytes received from each processor
std::vector<int> recv_cnt(comm_size, 0);
allGather<int>((int)x_in.size() + 1, &recv_cnt[0]);
std::vector<int> recv_disp(comm_size, 0);
for (int i = 1; i < comm_size; i++)
recv_disp[i] = recv_disp[i - 1] + recv_cnt[i - 1];
// Call the vector form of allGather for the char arrays
char *recv_data =
new char[recv_disp[comm_size - 1] + recv_cnt[comm_size - 1]];
allGather<char>(x_in.c_str(), (int)x_in.size() + 1, recv_data, &recv_cnt[0],
&recv_disp[0], true);
for (int i = 0; i < comm_size; i++)
x_out[i] = std::string(&recv_data[recv_disp[i]]);
delete[] recv_data;
}
// Default instantiation of MPI_CLASS::allGather
template <class TYPE>
inline void MPI_CLASS::allGather(const TYPE &x_in, TYPE *x_out) const {
static_assert(is_mpi_copyable<TYPE>(), "Object is not trivially copyable");
if (comm_size > 1) {
// We can use the vector form of allGather with a char array to get the data we want
call_allGather(x_in, x_out);
} else {
// Single processor case
x_out[0] = x_in;
}
}
// Specialization of the array form of MPI_CLASS::allGather for std::string (not supported)
template <>
inline int MPI_CLASS::allGather<std::string>(const std::string *, int,
std::string *, int *, int *,
bool) const {
MPI_CLASS_ERROR("Cannot allGather an array of strings");
return 0;
}
// Define specializations of call_allGather(const TYPE&, TYPE*)
#if defined(USE_MPI) || defined(USE_EXT_MPI)
template <>
void MPI_CLASS::call_allGather<unsigned char>(const unsigned char &,
unsigned char *) const;
template <> void MPI_CLASS::call_allGather<char>(const char &, char *) const;
template <>
void MPI_CLASS::call_allGather<unsigned int>(const unsigned int &,
unsigned int *) const;
template <> void MPI_CLASS::call_allGather<int>(const int &, int *) const;
template <>
void MPI_CLASS::call_allGather<unsigned long int>(const unsigned long int &,
unsigned long int *) const;
template <>
void MPI_CLASS::call_allGather<long int>(const long int &, long int *) const;
template <> void MPI_CLASS::call_allGather<float>(const float &, float *) const;
template <>
void MPI_CLASS::call_allGather<double>(const double &, double *) const;
#endif
// Default instantiation of MPI_CLASS::allGather
template <class TYPE>
int MPI_CLASS::allGather(const TYPE *send_data, int send_cnt, TYPE *recv_data,
int *recv_cnt, int *recv_disp, bool known_recv) const {
// Check the inputs
if (known_recv && (recv_cnt == nullptr || recv_disp == nullptr))
MPI_CLASS_ERROR("Error calling allGather");
// Check if we are dealing with a single processor
if (comm_size == 1) {
if (send_data == nullptr && send_cnt > 0) {
MPI_CLASS_ERROR("send_data is null");
} else if (!known_recv) {
// We do not know the received sizes
for (int i = 0; i < send_cnt; i++)
recv_data[i] = send_data[i];
if (recv_cnt != nullptr)
recv_cnt[0] = send_cnt;
if (recv_disp != nullptr)
recv_disp[0] = 0;
} else {
// We know the received sizes
for (int i = 0; i < send_cnt; i++)
recv_data[i + recv_disp[0]] = send_data[i];
}
return send_cnt;
}
// Get the sizes of the received data (if necessary)
int *recv_cnt2 = recv_cnt;
int *recv_disp2 = recv_disp;
if (!known_recv) {
if (recv_cnt == nullptr)
recv_cnt2 = new int[comm_size];
if (recv_disp == nullptr)
recv_disp2 = new int[comm_size];
call_allGather(send_cnt, recv_cnt2);
recv_disp2[0] = 0;
for (int i = 1; i < comm_size; i++)
recv_disp2[i] = recv_disp2[i - 1] + recv_cnt2[i - 1];
}
int N_recv = 0;
for (int i = 0; i < comm_size; i++)
N_recv += recv_cnt2[i];
// Send/recv the data
call_allGather(send_data, send_cnt, recv_data, recv_cnt2, recv_disp2);
// Delete any temporary memory
if (recv_cnt == nullptr)
delete[] recv_cnt2;
if (recv_disp == nullptr)
delete[] recv_disp2;
return N_recv;
}
// Default instantiations of call_allGather(const TYPE&, TYPE*)
template <class TYPE>
void MPI_CLASS::call_allGather(const TYPE &x_in, TYPE *x_out) const {
static_assert(is_mpi_copyable<TYPE>(), "Object is not trivially copyable");
allGather<char>((const char *)&x_in, (int)sizeof(TYPE), (char *)x_out);
}
// Define specializations of call_allGather(const TYPE*, int, TYPE*, int*, int*)
#if defined(USE_MPI) || defined(USE_EXT_MPI)
template <>
void MPI_CLASS::call_allGather<unsigned char>(const unsigned char *, int,
unsigned char *, int *,
int *) const;
template <>
void MPI_CLASS::call_allGather<char>(const char *, int, char *, int *,
int *) const;
template <>
void MPI_CLASS::call_allGather<unsigned int>(const unsigned int *, int,
unsigned int *, int *,
int *) const;
template <>
void MPI_CLASS::call_allGather<int>(const int *, int, int *, int *,
int *) const;
template <>
void MPI_CLASS::call_allGather<unsigned long int>(const unsigned long int *,
int, unsigned long int *,
int *, int *) const;
template <>
void MPI_CLASS::call_allGather<long int>(const long int *, int, long int *,
int *, int *) const;
template <>
void MPI_CLASS::call_allGather<float>(const float *, int, float *, int *,
int *) const;
template <>
void MPI_CLASS::call_allGather<double>(const double *, int, double *, int *,
int *) const;
#else
template <>
void MPI_CLASS::call_allGather<char>(const char *, int, char *, int *,
int *) const;
#endif
// Default instantiations of call_allGather(const TYPE*, int, TYPE*, int*, int*)
template <class TYPE>
void MPI_CLASS::call_allGather(const TYPE *x_in, int size_in, TYPE *x_out,
int *size_out, int *disp_out) const {
int *size2 = new int[comm_size];
int *disp2 = new int[comm_size];
for (int i = 0; i < comm_size; i++) {
size2[i] = size_out[i] * sizeof(TYPE);
disp2[i] = disp_out[i] * sizeof(TYPE);
}
static_assert(is_mpi_copyable<TYPE>(), "Object is not trivially copyable");
call_allGather<char>((const char *)x_in, (int)size_in * sizeof(TYPE),
(char *)x_out, size2, disp2);
delete[] size2;
delete[] disp2;
}
/************************************************************************
* setGather *
************************************************************************/
template <class TYPE>
inline void MPI_CLASS::setGather(std::set<TYPE> &set) const {
std::vector<TYPE> send_buf(set.begin(), set.end());
std::vector<int> recv_cnt(this->comm_size, 0);
this->allGather<int>((int)send_buf.size(), &recv_cnt[0]);
std::vector<int> recv_disp(this->comm_size, 0);
for (int i = 1; i < this->comm_size; i++)
recv_disp[i] = recv_disp[i - 1] + recv_cnt[i - 1];
size_t N_recv_tot = 0;
for (int i = 0; i < this->comm_size; i++)
N_recv_tot += recv_cnt[i];
if (N_recv_tot == 0)
return;
std::vector<TYPE> recv_buf(N_recv_tot);
TYPE *send_data = nullptr;
if (send_buf.size() > 0) {
send_data = &send_buf[0];
}
TYPE *recv_data = &recv_buf[0];
static_assert(is_mpi_copyable<TYPE>(), "Object is not trivially copyable");
this->allGather<TYPE>(send_data, (int)send_buf.size(), recv_data,
&recv_cnt[0], &recv_disp[0], true);
for (size_t i = 0; i < recv_buf.size(); i++)
set.insert(recv_buf[i]);
}
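// Usage sketch for setGather (illustrative only; `comm` assumed): after the
// call every rank holds the union of all per-rank sets.
//   std::set<int> owned_ids;      // filled locally with this rank's ids
//   comm.setGather(owned_ids);    // owned_ids now contains every rank's ids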
/************************************************************************
* mapGather *
************************************************************************/
template <class KEY, class DATA>
inline void MPI_CLASS::mapGather(std::map<KEY, DATA> &map) const {
std::vector<KEY> send_id;
std::vector<DATA> send_data;
send_id.reserve(map.size());
send_data.reserve(map.size());
for (auto it = map.begin(); it != map.end(); ++it) {
send_id.push_back(it->first);
send_data.push_back(it->second);
}
int send_size = (int)send_id.size();
std::vector<int> recv_cnt(this->comm_size, 0);
this->allGather<int>(send_size, &recv_cnt[0]);
std::vector<int> recv_disp(this->comm_size, 0);
for (int i = 1; i < this->comm_size; i++)
recv_disp[i] = recv_disp[i - 1] + recv_cnt[i - 1];
size_t N_recv_tot = 0;
for (int i = 0; i < this->comm_size; i++)
N_recv_tot += recv_cnt[i];
if (N_recv_tot == 0)
return;
std::vector<KEY> recv_id(N_recv_tot);
std::vector<DATA> recv_data(N_recv_tot);
KEY *send_data1 = nullptr;
DATA *send_data2 = nullptr;
if (send_id.size() > 0) {
send_data1 = &send_id[0];
send_data2 = &send_data[0];
}
static_assert(is_mpi_copyable<DATA>(), "Object is not trivially copyable");
this->allGather<KEY>(send_data1, send_size, &recv_id[0], &recv_cnt[0],
&recv_disp[0], true);
this->allGather<DATA>(send_data2, send_size, &recv_data[0], &recv_cnt[0],
&recv_disp[0], true);
map = std::map<KEY, DATA>();
for (size_t i = 0; i < N_recv_tot; i++)
map.insert(std::pair<KEY, DATA>(recv_id[i], recv_data[i]));
}
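// Usage sketch for mapGather (illustrative only; `comm` assumed): merges the
// per-rank maps onto every rank; for duplicate keys std::map::insert keeps the
// first entry encountered (i.e. the contribution from the lowest rank).
//   std::map<int, double> id_to_volume;   // filled locally
//   comm.mapGather(id_to_volume);         // now contains entries from all ranks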
/************************************************************************
* sumScan *
************************************************************************/
template <class TYPE>
inline void MPI_CLASS::sumScan(const TYPE *x, TYPE *y, int n) const {
if (comm_size > 1) {
call_sumScan(x, y, n);
} else {
for (int i = 0; i < n; i++)
y[i] = x[i];
}
}
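// Usage sketch for the scan operations (illustrative only; `comm` assumed, and
// the MPI backend is assumed to perform an inclusive scan): sumScan turns
// per-rank counts into global offsets, for example.
//   int local_n = 100;                      // hypothetical per-rank count
//   int end_index = 0;
//   comm.sumScan(&local_n, &end_index, 1);  // inclusive prefix sum over ranks
//   int start_index = end_index - local_n;  // first global index owned by this rank
// minScan/maxScan follow the same calling convention with running min/max.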
// Define specializations of call_sumScan(const TYPE*, TYPE*, int)
#if defined(USE_MPI) || defined(USE_EXT_MPI)
template <>
void MPI_CLASS::call_sumScan<unsigned char>(const unsigned char *,
unsigned char *, int) const;
template <> void MPI_CLASS::call_sumScan<char>(const char *, char *, int) const;
template <>
void MPI_CLASS::call_sumScan<unsigned int>(const unsigned int *, unsigned int *,
int) const;
template <> void MPI_CLASS::call_sumScan<int>(const int *, int *, int) const;
template <>
void MPI_CLASS::call_sumScan<unsigned long int>(const unsigned long int *,
unsigned long int *, int) const;
template <>
void MPI_CLASS::call_sumScan<long int>(const long int *, long int *, int) const;
template <>
void MPI_CLASS::call_sumScan<size_t>(const size_t *, size_t *, int) const;
template <>
void MPI_CLASS::call_sumScan<float>(const float *, float *, int) const;
template <>
void MPI_CLASS::call_sumScan<double>(const double *, double *, int) const;
template <>
void MPI_CLASS::call_sumScan<std::complex<double>>(const std::complex<double> *,
std::complex<double> *,
int) const;
#endif
// Default instantiations of call_sumScan(const TYPE*, TYPE*, int)
template <class TYPE>
void MPI_CLASS::call_sumScan(const TYPE *, TYPE *, int) const {
char message[200];
sprintf(message,
"Default instantion of sumScan in parallel is not supported (%s)",
typeid(TYPE).name());
MPI_CLASS_ERROR(message);
}
/************************************************************************
* minScan *
************************************************************************/
template <class TYPE>
inline void MPI_CLASS::minScan(const TYPE *x, TYPE *y, int n) const {
if (comm_size > 1) {
call_minScan(x, y, n);
} else {
for (int i = 0; i < n; i++)
y[i] = x[i];
}
}
// Define specializations of call_minScan(const TYPE*, TYPE*, int)
#if defined(USE_MPI) || defined(USE_EXT_MPI)
template <>
void MPI_CLASS::call_minScan<unsigned char>(const unsigned char *,
unsigned char *, int) const;
template <> void MPI_CLASS::call_minScan<char>(const char *, char *, int) const;
template <>
void MPI_CLASS::call_minScan<unsigned int>(const unsigned int *, unsigned int *,
int) const;
template <> void MPI_CLASS::call_minScan<int>(const int *, int *, int) const;
template <>
void MPI_CLASS::call_minScan<unsigned long int>(const unsigned long int *,
unsigned long int *, int) const;
template <>
void MPI_CLASS::call_minScan<long int>(const long int *, long int *, int) const;
template <>
void MPI_CLASS::call_minScan<size_t>(const size_t *, size_t *, int) const;
template <>
void MPI_CLASS::call_minScan<float>(const float *, float *, int) const;
template <>
void MPI_CLASS::call_minScan<double>(const double *, double *, int) const;
#endif
// Default instantiations of call_minScan(const TYPE*, TYPE*, int)
template <class TYPE>
void MPI_CLASS::call_minScan(const TYPE *, TYPE *, int) const {
char message[200];
sprintf(message,
"Default instantion of minScan in parallel is not supported (%s)",
typeid(TYPE).name());
MPI_CLASS_ERROR(message);
}
/************************************************************************
* maxScan *
************************************************************************/
template <class TYPE>
inline void MPI_CLASS::maxScan(const TYPE *x, TYPE *y, int n) const {
if (comm_size > 1) {
call_maxScan(x, y, n);
} else {
for (int i = 0; i < n; i++)
y[i] = x[i];
}
}
// Define specializations of call_maxScan(const TYPE*, TYPE*, int)
#if defined(USE_MPI) || defined(USE_EXT_MPI)
template <>
void MPI_CLASS::call_maxScan<unsigned char>(const unsigned char *,
unsigned char *, int) const;
template <> void MPI_CLASS::call_maxScan<char>(const char *, char *, int) const;
template <>
void MPI_CLASS::call_maxScan<unsigned int>(const unsigned int *, unsigned int *,
int) const;
template <> void MPI_CLASS::call_maxScan<int>(const int *, int *, int) const;
template <>
void MPI_CLASS::call_maxScan<unsigned long int>(const unsigned long int *,
unsigned long int *, int) const;
template <>
void MPI_CLASS::call_maxScan<long int>(const long int *, long int *, int) const;
template <>
void MPI_CLASS::call_maxScan<size_t>(const size_t *, size_t *, int) const;
template <>
void MPI_CLASS::call_maxScan<float>(const float *, float *, int) const;
template <>
void MPI_CLASS::call_maxScan<double>(const double *, double *, int) const;
#endif
// Default instantiations of call_maxScan(const TYPE*, TYPE*, int)
template <class TYPE>
void MPI_CLASS::call_maxScan(const TYPE *, TYPE *, int) const {
char message[200];
sprintf(message,
"Default instantion of maxReduce in parallel is not supported (%s)",
typeid(TYPE).name());
MPI_CLASS_ERROR(message);
}
/************************************************************************
* allToAll *
************************************************************************/
// Define specializations of allToAll( int n, const TYPE*, TYPE* )
#if defined(USE_MPI) || defined(USE_EXT_MPI)
template <>
void MPI_CLASS::allToAll<unsigned char>(int n, const unsigned char *,
unsigned char *) const;
template <> void MPI_CLASS::allToAll<char>(int n, const char *, char *) const;
template <>
void MPI_CLASS::allToAll<unsigned int>(int n, const unsigned int *,
unsigned int *) const;
template <> void MPI_CLASS::allToAll<int>(int n, const int *, int *) const;
template <>
void MPI_CLASS::allToAll<unsigned long int>(int n, const unsigned long int *,
unsigned long int *) const;
template <>
void MPI_CLASS::allToAll<long int>(int n, const long int *, long int *) const;
template <>
void MPI_CLASS::allToAll<float>(int n, const float *, float *) const;
template <>
void MPI_CLASS::allToAll<double>(int n, const double *, double *) const;
#endif
// Default instantiations of allToAll( int n, const TYPE*, TYPE* )
#if defined(USE_MPI) || defined(USE_EXT_MPI)
template <class TYPE>
void MPI_CLASS::allToAll(int n, const TYPE *send_data, TYPE *recv_data) const {
static_assert(is_mpi_copyable<TYPE>(), "Object is not trivially copyable");
allToAll<char>(n * sizeof(TYPE), (char *)send_data, (char *)recv_data);
}
#else
template <class TYPE>
void MPI_CLASS::allToAll(int n, const TYPE *send_data, TYPE *recv_data) const {
if (comm_size != 1)
MPI_CLASS_ERROR("Invalid size for allToAll");
for (int i = 0; i < n; i++)
recv_data[i] = send_data[i];
}
#endif
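// Usage sketch for the fixed-size allToAll (illustrative only; `comm` assumed):
// each rank sends n entries to every rank and receives n entries from every
// rank, so both buffers hold n*getSize() entries.
//   const int n = 2;
//   std::vector<int> send(n * comm.getSize()), recv(n * comm.getSize());
//   comm.allToAll(n, send.data(), recv.data());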
/************************************************************************
* allToAll *
************************************************************************/
template <class TYPE>
int MPI_CLASS::allToAll(const TYPE *send_data, const int send_cnt[],
const int send_disp[], TYPE *recv_data, int *recv_cnt,
int *recv_disp, bool known_recv) const {
int N_recieved = 0;
if (comm_size == 1) {
// Special case for single-processor communicators
if (known_recv) {
if (recv_cnt[0] != send_cnt[0] && send_cnt[0] > 0)
MPI_CLASS_ERROR(
"Single processor send/recv are different sizes");
} else {
if (recv_cnt != nullptr)
recv_cnt[0] = send_cnt[0];
if (recv_disp != nullptr)
recv_disp[0] = send_disp[0];
}
for (int i = 0; i < send_cnt[0]; i++)
recv_data[i + recv_disp[0]] = send_data[i + send_disp[0]];
N_recieved = send_cnt[0];
} else if (known_recv) {
// The receive sizes are known
MPI_CLASS_ASSERT(recv_cnt != nullptr && recv_disp != nullptr);
call_allToAll(send_data, send_cnt, send_disp, recv_data, recv_cnt,
recv_disp);
for (int i = 0; i < comm_size; i++)
N_recieved += recv_cnt[i];
} else {
// The receive sizes are not known; we need to communicate that information first
int *recv_cnt2 = recv_cnt;
int *recv_disp2 = recv_disp;
if (recv_cnt == nullptr)
recv_cnt2 = new int[comm_size];
if (recv_disp == nullptr)
recv_disp2 = new int[comm_size];
// Communicate the size we will be receiving from each processor
allToAll<int>(1, send_cnt, recv_cnt2);
recv_disp2[0] = 0;
for (int i = 1; i < comm_size; i++)
recv_disp2[i] = recv_disp2[i - 1] + recv_cnt2[i - 1];
// Send the data
call_allToAll(send_data, send_cnt, send_disp, recv_data, recv_cnt2,
recv_disp2);
for (int i = 0; i < comm_size; i++)
N_recieved += recv_cnt2[i];
if (recv_cnt == nullptr)
delete[] recv_cnt2;
if (recv_disp == nullptr)
delete[] recv_disp2;
}
return N_recieved;
}
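// Usage sketch for the variable-size allToAll (illustrative only; `comm`
// assumed; counts and displacements are per destination rank, in elements, and
// recv_buf must be preallocated large enough to hold everything received):
//   int size = comm.getSize();
//   std::vector<int> send_cnt(size), send_disp(size);   // filled by the caller
//   std::vector<double> send_buf(1024), recv_buf(4096); // hypothetical sizes
//   std::vector<int> recv_cnt(size), recv_disp(size);
//   int N = comm.allToAll(send_buf.data(), send_cnt.data(), send_disp.data(),
//                         recv_buf.data(), recv_cnt.data(), recv_disp.data(),
//                         /*known_recv=*/false);        // N = total elements received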
// Define specializations of call_allToAll
#if defined(USE_MPI) || defined(USE_EXT_MPI)
template <>
void MPI_CLASS::call_allToAll<unsigned char>(const unsigned char *, const int *,
const int *, unsigned char *,
const int *, const int *) const;
template <>
void MPI_CLASS::call_allToAll<char>(const char *, const int *, const int *,
char *, const int *, const int *) const;
template <>
void MPI_CLASS::call_allToAll<unsigned int>(const unsigned int *, const int *,
const int *, unsigned int *,
const int *, const int *) const;
template <>
void MPI_CLASS::call_allToAll<int>(const int *, const int *, const int *, int *,
const int *, const int *) const;
template <>
void MPI_CLASS::call_allToAll<unsigned long int>(const unsigned long int *,
const int *, const int *,
unsigned long int *,
const int *,
const int *) const;
template <>
void MPI_CLASS::call_allToAll<long int>(const long int *, const int *,
const int *, long int *, const int *,
const int *) const;
template <>
void MPI_CLASS::call_allToAll<float>(const float *, const int *, const int *,
float *, const int *, const int *) const;
template <>
void MPI_CLASS::call_allToAll<double>(const double *, const int *, const int *,
double *, const int *, const int *) const;
#else
template <>
void MPI_CLASS::call_allToAll<char>(const char *, const int *, const int *,
char *, const int *, const int *) const;
#endif
// Default instantiations of call_allToAll
template <class TYPE>
void MPI_CLASS::call_allToAll(const TYPE *send_data, const int send_cnt[],
const int send_disp[], TYPE *recv_data,
const int *recv_cnt, const int *recv_disp) const {
int *send_cnt2 = new int[comm_size];
int *recv_cnt2 = new int[comm_size];
int *send_disp2 = new int[comm_size];
int *recv_disp2 = new int[comm_size];
for (int i = 0; i < comm_size; i++) {
send_cnt2[i] = send_cnt[i] * sizeof(TYPE);
send_disp2[i] = send_disp[i] * sizeof(TYPE);
recv_cnt2[i] = recv_cnt[i] * sizeof(TYPE);
recv_disp2[i] = recv_disp[i] * sizeof(TYPE);
}
static_assert(is_mpi_copyable<TYPE>(), "Object is not trivially copyable");
call_allToAll<char>((char *)send_data, send_cnt2, send_disp2,
(char *)recv_data, recv_cnt2, recv_disp2);
delete[] send_cnt2;
delete[] recv_cnt2;
delete[] send_disp2;
delete[] recv_disp2;
}
} // namespace Utilities
#endif