// LBPM/common/MPI.cpp
/*
Copyright 2013--2018 James E. McClure, Virginia Polytechnic & State University
Copyright Equinor ASA
This file is part of the Open Porous Media project (OPM).
OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
// This file implements a wrapper class for MPI functions
#include "common/MPI.h"
#include "common/Utilities.h"
#include "common/Utilities.hpp"
#include "ProfilerApp.h"
#include "StackTrace/ErrorHandlers.h"
#include "StackTrace/StackTrace.h"
// Include all other headers
#include <algorithm>
#include <chrono>
#include <climits>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <limits>
#include <random>
#include <stdexcept>
#include <thread>
#include <typeinfo>
// Include OS specific headers
#undef USE_WINDOWS
#undef USE_LINUX
#undef USE_MAC
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
// We are using windows
#define USE_WINDOWS
#include <process.h>
#include <windows.h>
#define sched_yield() Sleep(0)
#elif defined(__APPLE__)
// Using MAC
#define USE_MAC
#include <sched.h>
#elif defined(__linux) || defined(__linux__) || defined(__unix) || \
defined(__posix)
// We are using linux
#define USE_LINUX
#include <sched.h>
#include <unistd.h>
#else
#error Unknown OS
#endif
// Convenience defines
#define MPI_ERROR ERROR
#define MPI_ASSERT ASSERT
#define MPI_INSIST INSIST
#define MPI_WARNING WARNING
#define MPI_CLASS_COMM_NULL MPI_COMM_NULL
#define MPI_CLASS_COMM_SELF MPI_COMM_SELF
#define MPI_CLASS_COMM_WORLD MPI_COMM_WORLD
// Global variable used to create new unique comms (dup and split)
#ifndef USE_MPI
MPI_Comm uniqueGlobalComm = 11;
#endif
#if defined(USE_SAMRAI) && defined(USE_PETSC) && !defined(USE_MPI)
int MPI_REQUEST_NULL = 3;
int MPI_ERR_IN_STATUS = 4;
#endif
namespace Utilities {
// Some special structs to work with MPI
#ifdef USE_MPI
struct IntIntStruct {
int j;
int i;
};
struct LongIntStruct {
long int j;
int i;
};
struct FloatIntStruct {
float f;
int i;
};
struct DoubleIntStruct {
double d;
int i;
};
#endif
// Initialize the static member variables
volatile unsigned int MPI_CLASS::N_MPI_Comm_created = 0;
volatile unsigned int MPI_CLASS::N_MPI_Comm_destroyed = 0;
short MPI_CLASS::profile_level = 127;
// Define a type for use with size_t
#ifdef USE_MPI
static MPI_Datatype MPI_SIZE_T = 0x0;
static MPI_Datatype getSizeTDataType() {
int size_int, size_long, size_longlong, size_longlong2;
MPI_Type_size(MPI_UNSIGNED, &size_int);
MPI_Type_size(MPI_UNSIGNED_LONG, &size_long);
MPI_Type_size(MPI_UNSIGNED_LONG_LONG, &size_longlong);
MPI_Type_size(MPI_LONG_LONG_INT, &size_longlong2);
if (sizeof(size_t) == size_int) {
return MPI_UNSIGNED;
} else if (sizeof(size_t) == size_long) {
return MPI_UNSIGNED_LONG;
} else if (sizeof(size_t) == size_longlong) {
return MPI_UNSIGNED_LONG_LONG;
} else if (sizeof(size_t) == size_longlong2) {
MPI_WARNING("Using signed long long datatype for size_t in MPI");
return MPI_LONG_LONG_INT; // Note: this is not unsigned
} else {
MPI_ERROR("No suitable datatype found");
}
return 0;
}
#endif
// Static data for asynchronous communication without MPI
// Note: these routines may not be thread-safe yet
#ifndef USE_MPI
static const int mpi_max_tag = 0x003FFFFF;
struct Isendrecv_struct {
const char *data; // Pointer to data
int status; // Status: 1-sending, 2-receiving
};
std::map<MPI_Request, Isendrecv_struct> global_isendrecv_list;
static MPI_Request getRequest(MPI_Comm comm, int tag) {
MPI_ASSERT(tag >= 0 && tag <= mpi_max_tag);
// Use hashing function: 2^64*0.5*(sqrt(5)-1)
uint64_t a = static_cast<uint64_t>(comm) * 0x9E3779B97F4A7C15;
uint64_t b = static_cast<uint64_t>(tag) * 0x9E3779B97F4A7C15;
uint64_t hash = a ^ b;
MPI_Request request;
memcpy(&request, &hash, sizeof(MPI_Request));
return request;
}
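// Illustrative note: in this serial fallback each (comm, tag) pair is mapped to a pseudo
// MPI_Request by mixing both values with the 64-bit Fibonacci constant; e.g. (hypothetical
// values) comm = 11 and tag = 5 produce hash(11) XOR hash(5), which the serial Isend/Irecv
// bookkeeping in global_isendrecv_list can use as a matching key without a real MPI library.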
#endif
// Check the mpi error code
#ifdef USE_MPI
inline void check_MPI(int error) {
if (error != MPI_SUCCESS)
MPI_ERROR("Error calling MPI routine");
}
#endif
/******************************************************************
* Some helper functions to convert between signed/unsigned types *
******************************************************************/
DISABLE_WARNINGS
static inline constexpr unsigned int offset_int() {
return ~static_cast<unsigned int>(std::numeric_limits<int>::min()) + 1;
}
static inline constexpr unsigned long int offset_long() {
return ~static_cast<unsigned long int>(
std::numeric_limits<long int>::min()) +
1;
}
static inline constexpr unsigned long long int offset_long_long() {
return ~static_cast<unsigned long long int>(
std::numeric_limits<long long int>::min()) +
1;
}
ENABLE_WARNINGS
static inline unsigned int signed_to_unsigned(int x) {
const auto offset = offset_int();
return (x >= 0) ? static_cast<unsigned int>(x) + offset
: offset - static_cast<unsigned int>(-x);
}
static inline unsigned long int signed_to_unsigned(long int x) {
const auto offset = offset_long();
return (x >= 0) ? static_cast<unsigned long int>(x) + offset
: offset - static_cast<unsigned long int>(-x);
}
static inline unsigned long long int signed_to_unsigned(long long int x) {
const auto offset = offset_long_long();
return (x >= 0) ? static_cast<unsigned long long int>(x) + offset
: offset - static_cast<unsigned long long int>(-x);
}
static inline int unsigned_to_signed(unsigned int x) {
const auto offset = offset_int();
return (x >= offset) ? static_cast<int>(x - offset)
: -static_cast<int>(offset - x);
}
static inline long int unsigned_to_signed(unsigned long int x) {
const auto offset = offset_long();
return (x >= offset) ? static_cast<long int>(x - offset)
: -static_cast<long int>(offset - x);
}
static inline long long int unsigned_to_signed(unsigned long long int x) {
const auto offset = offset_long_long();
return (x >= offset) ? static_cast<long long int>(x - offset)
: -static_cast<long long int>(offset - x);
}
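// Worked example (assuming a 32-bit int, so offset_int() == 2^31):
//     unsigned_to_signed(1u)          == INT_MIN + 1
//     unsigned_to_signed(0x80000000u) == 0
//     unsigned_to_signed(0xFFFFFFFFu) == INT_MAX
// and signed_to_unsigned() inverts the mapping (e.g. signed_to_unsigned(1) == 0x80000001).
// Because the mapping preserves ordering, min/max reductions on unsigned types can be routed
// through the signed reductions below and converted back afterwards.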
/************************************************************************
* Get the MPI version *
************************************************************************/
std::array<int, 2> MPI_CLASS::version() {
#ifdef USE_MPI
int MPI_version;
int MPI_subversion;
MPI_Get_version(&MPI_version, &MPI_subversion);
return {MPI_version, MPI_subversion};
#else
return {0, 0};
#endif
}
std::string MPI_CLASS::info() {
#ifdef USE_MPI
#if MPI_VERSION >= 3
int MPI_version_length = 0;
char MPI_version_string[MPI_MAX_LIBRARY_VERSION_STRING];
MPI_Get_library_version(MPI_version_string, &MPI_version_length);
if (MPI_version_length > 0) {
std::string MPI_info(MPI_version_string, MPI_version_length);
size_t pos = MPI_info.find('\n');
while (pos != std::string::npos) {
MPI_info.insert(pos + 1, " ");
pos = MPI_info.find('\n', pos + 1);
}
return MPI_info;
}
#endif
auto tmp = version();
return std::to_string(tmp[0]) + "." + std::to_string(tmp[1]);
#else
return std::string();
#endif
}
/************************************************************************
* Functions to get/set the process affinities *
************************************************************************/
int MPI_CLASS::getNumberOfProcessors() {
return std::thread::hardware_concurrency();
}
std::vector<int> MPI_CLASS::getProcessAffinity() {
std::vector<int> procs;
#ifdef USE_LINUX
cpu_set_t mask;
int error = sched_getaffinity(getpid(), sizeof(cpu_set_t), &mask);
if (error != 0)
MPI_ERROR("Error getting process affinity");
for (int i = 0; i < (int)sizeof(cpu_set_t) * CHAR_BIT; i++) {
if (CPU_ISSET(i, &mask))
procs.push_back(i);
}
#elif defined(USE_MAC)
// MAC does not support getting or setting the affinity
printf("Warning: MAC does not support getting the process affinity\n");
procs.clear();
#elif defined(USE_WINDOWS)
HANDLE hProc = GetCurrentProcess();
size_t procMask;
size_t sysMask;
PDWORD_PTR procMaskPtr = reinterpret_cast<PDWORD_PTR>(&procMask);
PDWORD_PTR sysMaskPtr = reinterpret_cast<PDWORD_PTR>(&sysMask);
GetProcessAffinityMask(hProc, procMaskPtr, sysMaskPtr);
for (int i = 0; i < (int)sizeof(size_t) * CHAR_BIT; i++) {
if ((procMask & 0x1) != 0)
procs.push_back(i);
procMask >>= 1;
}
#else
#error Unknown OS
#endif
return procs;
}
void MPI_CLASS::setProcessAffinity(const std::vector<int> &procs) {
#ifdef USE_LINUX
cpu_set_t mask;
CPU_ZERO(&mask);
for (auto cpu : procs)
CPU_SET(cpu, &mask);
int error = sched_setaffinity(getpid(), sizeof(cpu_set_t), &mask);
if (error != 0)
MPI_ERROR("Error setting process affinity");
#elif defined(USE_MAC)
// MAC does not support getting or setting the affinity
NULL_USE(procs);
#elif defined(USE_WINDOWS)
DWORD mask = 0;
for (size_t i = 0; i < procs.size(); i++)
mask |= ((DWORD)1) << procs[i];
HANDLE hProc = GetCurrentProcess();
SetProcessAffinityMask(hProc, mask);
#else
#error Unknown OS
#endif
}
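// Usage sketch (illustrative only; assumes MPI_CLASS expands to Utilities::MPI as declared
// in common/MPI.h): restrict the calling process to the first two cores it currently reports.
//     auto cpus = Utilities::MPI::getProcessAffinity();
//     if (cpus.size() > 2)
//         Utilities::MPI::setProcessAffinity({cpus[0], cpus[1]});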
/************************************************************************
* Function to check if MPI is active *
************************************************************************/
bool MPI_CLASS::MPI_active() {
#ifdef USE_MPI
int initialized = 0, finalized = 0;
MPI_Initialized(&initialized);
MPI_Finalized(&finalized);
return initialized != 0 && finalized == 0;
#else
return true;
#endif
}
MPI_CLASS::ThreadSupport MPI_CLASS::queryThreadSupport() {
#ifdef USE_MPI
int provided = 0;
MPI_Query_thread(&provided);
if (provided == MPI_THREAD_SINGLE)
return ThreadSupport::SINGLE;
if (provided == MPI_THREAD_FUNNELED)
return ThreadSupport::FUNNELED;
if (provided == MPI_THREAD_SERIALIZED)
return ThreadSupport::SERIALIZED;
if (provided == MPI_THREAD_MULTIPLE)
return ThreadSupport::MULTIPLE;
return ThreadSupport::SINGLE;
#else
return ThreadSupport::MULTIPLE;
#endif
}
/************************************************************************
* Function to perform a load balance of the given processes *
************************************************************************/
void MPI_CLASS::balanceProcesses(const MPI_CLASS &globalComm, const int method,
const std::vector<int> &procs,
const int N_min_in, const int N_max_in) {
// Build the list of processors to use
std::vector<int> cpus = procs;
if (cpus.empty()) {
for (int i = 0; i < getNumberOfProcessors(); i++)
cpus.push_back(i);
}
// Handle the "easy cases"
if (method == 1) {
// Trivial case where we do not need any communication
setProcessAffinity(cpus);
return;
}
// Get the sub-communicator for the current node
MPI_CLASS nodeComm = globalComm.splitByNode();
int N_min = std::min<int>(std::max<int>(N_min_in, 1), cpus.size());
int N_max = N_max_in;
if (N_max == -1)
N_max = cpus.size();
N_max = std::min<int>(N_max, cpus.size());
MPI_ASSERT(N_max >= N_min);
// Perform the load balance within the node
if (method == 2) {
int N_proc = cpus.size() / nodeComm.getSize();
N_proc = std::max<int>(N_proc, N_min);
N_proc = std::min<int>(N_proc, N_max);
std::vector<int> cpus2(N_proc, -1);
for (int i = 0; i < N_proc; i++)
cpus2[i] = cpus[(nodeComm.getRank() * N_proc + i) % cpus.size()];
setProcessAffinity(cpus2);
} else {
MPI_ERROR("Unknown method for load balance");
}
}
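// Usage sketch (illustrative only; assumes MPI_CLASS expands to Utilities::MPI, and the
// argument values shown are examples, not defaults):
//     Utilities::MPI world(MPI_COMM_WORLD);
//     // method 1: every rank binds to all requested (or all available) cores
//     Utilities::MPI::balanceProcesses(world, 1, {}, 1, -1);
//     // method 2: the cores on each node are divided evenly among the ranks of that node
//     Utilities::MPI::balanceProcesses(world, 2, {}, 1, -1);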
/************************************************************************
* Empty constructor *
************************************************************************/
MPI_CLASS::MPI_CLASS() {
// Initialize the data members to a default communicator of MPI_COMM_NULL
#ifdef USE_MPI
communicator = MPI_COMM_NULL;
d_maxTag = 0x7FFFFFFF;
#else
communicator = MPI_CLASS_COMM_NULL;
d_maxTag = mpi_max_tag;
#endif
d_count = nullptr;
d_manage = false;
comm_rank = 0;
comm_size = 1;
d_isNull = true;
d_currentTag = nullptr;
d_call_abort = true;
tmp_alignment = -1;
}
/************************************************************************
* Empty destructor *
************************************************************************/
MPI_CLASS::~MPI_CLASS() { reset(); }
void MPI_CLASS::reset() {
// Decrement the count if used
int count = -1;
if (d_count != nullptr)
count = --(*d_count);
if (count == 0) {
// We are holding the last reference to the MPI_Comm object, we need to free it
if (d_manage) {
#ifdef USE_MPI
MPI_Comm_set_errhandler(communicator, MPI_ERRORS_ARE_FATAL);
int err = MPI_Comm_free(&communicator);
if (err != MPI_SUCCESS)
MPI_ERROR("Problem free'ing MPI_Comm object");
communicator = MPI_CLASS_COMM_NULL;
++N_MPI_Comm_destroyed;
#endif
}
delete d_count;
}
if (d_currentTag == nullptr) {
// No tag index
} else if (d_currentTag[1] > 1) {
--(d_currentTag[1]);
} else {
delete[] d_currentTag;
}
d_manage = false;
d_count = nullptr;
comm_rank = 0;
comm_size = 1;
d_maxTag = 0;
d_isNull = true;
d_currentTag = nullptr;
d_call_abort = true;
}
/************************************************************************
* Copy constructors *
************************************************************************/
MPI_CLASS::MPI_CLASS(const MPI_CLASS &comm)
: communicator(comm.communicator), d_isNull(comm.d_isNull),
d_manage(comm.d_manage), comm_rank(comm.comm_rank),
comm_size(comm.comm_size), d_maxTag(comm.d_maxTag),
d_currentTag(comm.d_currentTag) {
// Initialize the data members to the existing comm object
if (d_currentTag != nullptr)
++d_currentTag[1];
d_call_abort = comm.d_call_abort;
// Set and increment the count
d_count = comm.d_count;
if (d_count != nullptr)
++(*d_count);
tmp_alignment = -1;
}
MPI_CLASS::MPI_CLASS(MPI_CLASS &&rhs) : MPI_CLASS() {
std::swap(communicator, rhs.communicator);
std::swap(d_isNull, rhs.d_isNull);
std::swap(d_manage, rhs.d_manage);
std::swap(d_call_abort, rhs.d_call_abort);
std::swap(profile_level, rhs.profile_level);
std::swap(comm_rank, rhs.comm_rank);
std::swap(comm_size, rhs.comm_size);
std::swap(d_maxTag, rhs.d_maxTag);
std::swap(d_currentTag, rhs.d_currentTag);
std::swap(d_count, rhs.d_count);
std::swap(tmp_alignment, rhs.tmp_alignment);
}
/************************************************************************
* Assignment operators *
************************************************************************/
MPI_CLASS &MPI_CLASS::operator=(const MPI_CLASS &comm) {
if (this == &comm) // protect against invalid self-assignment
return *this;
// Destroy the previous object
this->reset();
// Initialize the data members to the existing object
this->communicator = comm.communicator;
this->comm_rank = comm.comm_rank;
this->comm_size = comm.comm_size;
this->d_isNull = comm.d_isNull;
this->d_manage = comm.d_manage;
this->d_maxTag = comm.d_maxTag;
this->d_call_abort = comm.d_call_abort;
this->d_currentTag = comm.d_currentTag;
if (this->d_currentTag != nullptr)
++(this->d_currentTag[1]);
// Set and increment the count
this->d_count = comm.d_count;
if (this->d_count != nullptr)
++(*d_count);
this->tmp_alignment = -1;
return *this;
}
MPI_CLASS &MPI_CLASS::operator=(MPI_CLASS &&rhs) {
if (this == &rhs) // protect against invalid self-assignment
return *this;
std::swap(communicator, rhs.communicator);
std::swap(d_isNull, rhs.d_isNull);
std::swap(d_manage, rhs.d_manage);
std::swap(d_call_abort, rhs.d_call_abort);
std::swap(profile_level, rhs.profile_level);
std::swap(comm_rank, rhs.comm_rank);
std::swap(comm_size, rhs.comm_size);
std::swap(d_maxTag, rhs.d_maxTag);
std::swap(d_currentTag, rhs.d_currentTag);
std::swap(d_count, rhs.d_count);
std::swap(tmp_alignment, rhs.tmp_alignment);
return *this;
}
/************************************************************************
* Constructor from existing MPI communicator *
************************************************************************/
int d_global_currentTag_world1[2] = {1, 1};
int d_global_currentTag_world2[2] = {1, 1};
int d_global_currentTag_self[2] = {1, 1};
#ifdef USE_MPI
std::atomic_int d_global_count_world1 = {1};
std::atomic_int d_global_count_world2 = {1};
std::atomic_int d_global_count_self = {1};
#endif
MPI_CLASS::MPI_CLASS(MPI_Comm comm, bool manage) {
d_count = nullptr;
d_manage = false;
tmp_alignment = -1;
// Check if we are using our version of comm_world
if (comm == MPI_CLASS_COMM_WORLD) {
communicator = MPI_COMM_WORLD;
} else if (comm == MPI_CLASS_COMM_SELF) {
communicator = MPI_COMM_SELF;
} else if (comm == MPI_CLASS_COMM_NULL) {
communicator = MPI_COMM_NULL;
} else {
communicator = comm;
}
#ifdef USE_MPI
// We are using MPI, use the MPI communicator to initialize the data
if (communicator != MPI_COMM_NULL) {
// Set the MPI_SIZE_T datatype if it has not been set
if (MPI_SIZE_T == 0x0)
MPI_SIZE_T = getSizeTDataType();
// Attach the error handler
StackTrace::setMPIErrorHandler(communicator);
// Get the communicator properties
MPI_Comm_rank(communicator, &comm_rank);
MPI_Comm_size(communicator, &comm_size);
int flag, *val;
int ierr = MPI_Comm_get_attr(communicator, MPI_TAG_UB, &val, &flag);
MPI_ASSERT(ierr == MPI_SUCCESS);
if (flag == 0) {
d_maxTag =
0x7FFFFFFF; // The tag is not a valid attribute (set to 2^31-1)
} else {
d_maxTag = *val;
if (d_maxTag < 0) {
d_maxTag = 0x7FFFFFFF;
} // The maximum tag is > a signed int (set to 2^31-1)
MPI_INSIST(d_maxTag >= 0x7FFF,
"maximum tag size is < MPI standard");
}
} else {
comm_rank = 1;
comm_size = 0;
d_maxTag = 0x7FFFFFFF;
}
d_isNull = communicator == MPI_COMM_NULL;
if (manage && communicator != MPI_COMM_NULL &&
communicator != MPI_COMM_SELF && communicator != MPI_COMM_WORLD)
d_manage = true;
// Create the count (Note: we do not need to worry about thread safety)
if (communicator == MPI_CLASS_COMM_WORLD) {
d_count = &d_global_count_world1;
++(*d_count);
} else if (communicator == MPI_COMM_WORLD) {
d_count = &d_global_count_world2;
++(*d_count);
} else if (communicator == MPI_COMM_SELF) {
d_count = &d_global_count_self;
++(*d_count);
} else if (communicator == MPI_COMM_NULL) {
d_count = nullptr;
} else {
d_count = new std::atomic_int;
*d_count = 1;
}
if (d_manage)
++N_MPI_Comm_created;
#else
// We are not using MPI, initialize based on the communicator
NULL_USE(manage);
comm_rank = 0;
comm_size = 1;
d_maxTag = mpi_max_tag;
d_isNull = communicator == MPI_COMM_NULL;
if (d_isNull)
comm_size = 0;
#endif
if (communicator == MPI_CLASS_COMM_WORLD) {
d_currentTag = d_global_currentTag_world1;
++(this->d_currentTag[1]);
} else if (communicator == MPI_COMM_WORLD) {
d_currentTag = d_global_currentTag_world2;
++(this->d_currentTag[1]);
} else if (communicator == MPI_COMM_SELF) {
d_currentTag = d_global_currentTag_self;
++(this->d_currentTag[1]);
} else if (communicator == MPI_COMM_NULL) {
d_currentTag = nullptr;
} else {
d_currentTag = new int[2];
d_currentTag[0] = (d_maxTag <= 0x10000) ? 1 : 0x1FFF;
d_currentTag[1] = 1;
}
d_call_abort = true;
}
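// Usage sketch (illustrative only; assumes MPI_CLASS expands to Utilities::MPI):
//     Utilities::MPI world(MPI_COMM_WORLD);   // world/self/null are never freed by the wrapper
//     MPI_Comm raw;
//     MPI_Comm_dup(MPI_COMM_WORLD, &raw);
//     Utilities::MPI owned(raw, true);        // manage=true: reference counted and freed with
//                                             // the last copy (see reset() above)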
/************************************************************************
* Return the ranks of the communicator in the global comm *
************************************************************************/
std::vector<int> MPI_CLASS::globalRanks() const {
if (d_isNull)
return std::vector<int>();
#ifdef USE_MPI
// Get my global rank and size if it has not been set
static int globalRank = -1;
static int globalSize = -1;
if (globalRank == -1 && MPI_active()) {
MPI_Comm_rank(MPI_CLASS_COMM_WORLD, &globalRank);
MPI_Comm_size(MPI_CLASS_COMM_WORLD, &globalSize);
}
// Check if we are dealing with a serial or global communicator
if (comm_size == 1)
return std::vector<int>(1, globalRank);
if (comm_size == globalSize) {
std::vector<int> ranks(globalSize);
for (int i = 0; i < globalSize; i++)
ranks[i] = i;
return ranks;
}
// Get the global rank from each rank in the communicator
auto ranks = allGather(globalRank);
std::sort(ranks.begin(), ranks.end());
return ranks;
#else
return std::vector<int>(1, 1);
#endif
}
/************************************************************************
* Generate a random number *
************************************************************************/
size_t MPI_CLASS::rand() const {
size_t val = 0;
if (getRank() == 0) {
static std::random_device rd;
static std::mt19937 gen(rd());
static std::uniform_int_distribution<size_t> dist;
val = dist(gen);
}
val = bcast(val, 0);
return val;
}
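// Note: rand() is a collective call; rank 0 draws the value and bcast() distributes it, so
// every rank of the communicator must participate and all ranks receive the same number, e.g.
//     size_t seed = comm.rand();   // identical on every rank of comm (illustrative)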
/************************************************************************
* Intersect two communicators *
************************************************************************/
#ifdef USE_MPI
static inline void MPI_Group_free2(MPI_Group *group) {
if (*group != MPI_GROUP_EMPTY) {
// MPICH is fine with free'ing an empty group, OpenMPI crashes
MPI_Group_free(group);
}
}
MPI_CLASS MPI_CLASS::intersect(const MPI_CLASS &comm1, const MPI_CLASS &comm2) {
MPI_Group group1 = MPI_GROUP_EMPTY, group2 = MPI_GROUP_EMPTY;
if (!comm1.isNull()) {
MPI_Group_free2(&group1);
MPI_Comm_group(comm1.communicator, &group1);
}
if (!comm2.isNull()) {
MPI_Group_free2(&group2);
MPI_Comm_group(comm2.communicator, &group2);
}
MPI_Group group12;
MPI_Group_intersection(group1, group2, &group12);
int compare1, compare2;
MPI_Group_compare(group1, group12, &compare1);
MPI_Group_compare(group2, group12, &compare2);
MPI_CLASS new_comm(MPI_CLASS_COMM_NULL);
int size;
MPI_Group_size(group12, &size);
if (compare1 != MPI_UNEQUAL && size != 0) {
// The intersection matches comm1
new_comm = comm1;
} else if (compare2 != MPI_UNEQUAL && size != 0) {
// The intersection matches comm2
new_comm = comm2;
} else if (comm1.isNull()) {
// comm1 is null, we can return safely (comm1 is needed for communication)
} else {
// The intersection is smaller than comm1 or comm2
// Check if the new comm is nullptr for all processors
int max_size = 0;
MPI_Allreduce(&size, &max_size, 1, MPI_INT, MPI_MAX,
comm1.communicator);
if (max_size == 0) {
// We are dealing with completely disjoint sets
new_comm = MPI_CLASS(MPI_CLASS_COMM_NULL, false);
} else {
// Create the new comm
// Note: OpenMPI crashes if the intersection group is EMPTY for any processors
// We will set it to SELF for the EMPTY processors, then create a nullptr comm later
if (group12 == MPI_GROUP_EMPTY) {
MPI_Group_free2(&group12);
MPI_Comm_group(MPI_COMM_SELF, &group12);
}
MPI_Comm new_MPI_comm;
MPI_Comm_create(comm1.communicator, group12, &new_MPI_comm);
if (size > 0) {
// This is the valid case where we create a new intersection comm
new_comm = MPI_CLASS(new_MPI_comm, true);
} else {
// We actually want a null comm for this communicator
new_comm = MPI_CLASS(MPI_CLASS_COMM_NULL, false);
MPI_Comm_free(&new_MPI_comm);
}
}
}
MPI_Group_free2(&group1);
MPI_Group_free2(&group2);
MPI_Group_free2(&group12);
return new_comm;
}
#else
MPI_CLASS MPI_CLASS::intersect(const MPI_CLASS &comm1, const MPI_CLASS &comm2) {
if (comm1.isNull() || comm2.isNull())
return MPI_CLASS(MPI_CLASS_COMM_NULL, false);
MPI_ASSERT(comm1.comm_size == 1 && comm2.comm_size == 1);
return comm1;
}
#endif
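// Usage sketch (illustrative only; assumes MPI_CLASS expands to Utilities::MPI): intersecting
// a node-local communicator with an arbitrary sub-communicator yields the ranks present in
// both, or a null communicator when the two groups are disjoint.
//     Utilities::MPI a = world.splitByNode();
//     Utilities::MPI b = world.split(world.getRank() % 2, 0);
//     Utilities::MPI c = Utilities::MPI::intersect(a, b);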
/************************************************************************
* Split a comm *
************************************************************************/
MPI_CLASS MPI_CLASS::split(int color, int key) const {
if (d_isNull) {
return MPI_CLASS(MPI_CLASS_COMM_NULL);
} else if (comm_size == 1) {
if (color == -1)
return MPI_CLASS(MPI_CLASS_COMM_NULL);
return dup();
}
MPI_Comm new_MPI_comm = MPI_CLASS_COMM_NULL;
#ifdef USE_MPI
// USE MPI to split the communicator
if (color == -1) {
check_MPI(
MPI_Comm_split(communicator, MPI_UNDEFINED, key, &new_MPI_comm));
} else {
check_MPI(MPI_Comm_split(communicator, color, key, &new_MPI_comm));
}
#endif
// Create the new object
NULL_USE(key);
MPI_CLASS new_comm(new_MPI_comm, true);
new_comm.d_call_abort = d_call_abort;
return new_comm;
}
MPI_CLASS MPI_CLASS::splitByNode(int key) const {
// Check if we are dealing with a single processor (trivial case)
if (comm_size == 1)
return this->split(0, 0);
// Get the node name
std::string name = MPI_CLASS::getNodeName();
// Gather the names from all ranks
std::vector<std::string> list(comm_size);
allGather(name, &list[0]);
// Create the colors (ranks that share a node name share a color)
std::vector<int> color(comm_size, -1);
int next_color = 0;
for (int i = 0; i < comm_size; i++) {
for (int j = 0; j < i; j++) {
if (list[i] == list[j]) {
color[i] = color[j];
break;
}
}
if (color[i] == -1)
color[i] = next_color++;
}
MPI_CLASS new_comm = this->split(color[comm_rank], key);
return new_comm;
}
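// Example (illustrative): if the gathered node names are { "n0", "n1", "n0", "n2" }, the
// colors become { 0, 1, 0, 2 }, so ranks 0 and 2 share one node communicator while ranks 1
// and 3 each receive their own.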
/************************************************************************
* Duplicate an existing comm object *
************************************************************************/
MPI_CLASS MPI_CLASS::dup() const {
if (d_isNull)
return MPI_CLASS(MPI_CLASS_COMM_NULL);
MPI_Comm new_MPI_comm = communicator;
#if defined(USE_MPI) || defined(USE_PETSC)
// USE MPI to duplicate the communicator
MPI_Comm_dup(communicator, &new_MPI_comm);
#else
new_MPI_comm = uniqueGlobalComm;
uniqueGlobalComm++;
#endif
// Create the new comm object
MPI_CLASS new_comm(new_MPI_comm, true);
new_comm.d_isNull = d_isNull;
new_comm.d_call_abort = d_call_abort;
return new_comm;
}
/************************************************************************
* Get the node name *
************************************************************************/
std::string MPI_CLASS::getNodeName() {
#ifdef USE_MPI
int length;
char name[MPI_MAX_PROCESSOR_NAME + 1];
memset(name, 0, MPI_MAX_PROCESSOR_NAME + 1);
MPI_Get_processor_name(name, &length);
return std::string(name);
#else
return "Node0";
#endif
}
/************************************************************************
* Overload operator == *
************************************************************************/
bool MPI_CLASS::operator==(const MPI_CLASS &comm) const {
return communicator == comm.communicator;
}
/************************************************************************
* Overload operator != *
************************************************************************/
bool MPI_CLASS::operator!=(const MPI_CLASS &comm) const {
return communicator != comm.communicator;
}
/************************************************************************
* Overload operator < *
************************************************************************/
bool MPI_CLASS::operator<(const MPI_CLASS &comm) const {
MPI_ASSERT(!this->d_isNull && !comm.d_isNull);
bool flag = true;
// First check if either communicator is NULL
if (this->d_isNull)
return false;
if (comm.d_isNull)
flag = false;
// Use compare to check if the comms are equal
if (compare(comm) != 0)
return false;
// Check that the size of the other communicator is > the current communicator size
if (comm_size >= comm.comm_size)
flag = false;
// Check the union of the communicator groups
// this is < comm iff this group is a subgroup of comm's group
#ifdef USE_MPI
MPI_Group group1 = MPI_GROUP_EMPTY, group2 = MPI_GROUP_EMPTY,
group12 = MPI_GROUP_EMPTY;
if (!d_isNull)
MPI_Comm_group(communicator, &group1);
if (!comm.d_isNull)
MPI_Comm_group(comm.communicator, &group2);
MPI_Group_union(group1, group2, &group12);
int compare;
MPI_Group_compare(group2, group12, &compare);
if (compare == MPI_UNEQUAL)
flag = false;
MPI_Group_free(&group1);
MPI_Group_free(&group2);
MPI_Group_free(&group12);
#endif
// Perform a global reduce of the flag (equivalent to all operation)
return allReduce(flag);
}
/************************************************************************
* Overload operator <= *
************************************************************************/
bool MPI_CLASS::operator<=(const MPI_CLASS &comm) const {
MPI_ASSERT(!this->d_isNull && !comm.d_isNull);
bool flag = true;
// First check if either communicator is NULL
if (this->d_isNull)
return false;
if (comm.d_isNull)
flag = false;
#ifdef USE_MPI
int world_size = 0;
MPI_Comm_size(MPI_COMM_WORLD, &world_size);
if (comm.getSize() == world_size)
return true;
if (getSize() == 1 && !comm.d_isNull)
return true;
#endif
// Use compare to check if the comms are equal
if (compare(comm) != 0)
return true;
// Check that the size of the other communicator is >= the current communicator size
// this is <= comm iff this group is a subgroup of comm's group
if (comm_size > comm.comm_size)
flag = false;
// Check the union of the communicator groups
#ifdef USE_MPI
MPI_Group group1, group2, group12;
MPI_Comm_group(communicator, &group1);
MPI_Comm_group(comm.communicator, &group2);
MPI_Group_union(group1, group2, &group12);
int compare;
MPI_Group_compare(group2, group12, &compare);
if (compare == MPI_UNEQUAL)
flag = false;
MPI_Group_free(&group1);
MPI_Group_free(&group2);
MPI_Group_free(&group12);
#endif
// Perform a global reduce of the flag (equivalent to all operation)
return allReduce(flag);
}
/************************************************************************
* Overload operator > *
************************************************************************/
bool MPI_CLASS::operator>(const MPI_CLASS &comm) const {
bool flag = true;
// First check if either communicator is NULL
if (this->d_isNull)
return false;
if (comm.d_isNull)
flag = false;
// Use compare to check if the comms are equal
if (compare(comm) != 0)
return false;
// Check that the size of the current communicator is > the other communicator size
if (comm_size <= comm.comm_size)
flag = false;
// Check the union of the communicator groups
// this is > comm iff comm's group is a subgroup of this group
#ifdef USE_MPI
MPI_Group group1 = MPI_GROUP_EMPTY, group2 = MPI_GROUP_EMPTY,
group12 = MPI_GROUP_EMPTY;
if (!d_isNull)
MPI_Comm_group(communicator, &group1);
if (!comm.d_isNull)
MPI_Comm_group(comm.communicator, &group2);
MPI_Group_union(group1, group2, &group12);
int compare;
MPI_Group_compare(group1, group12, &compare);
if (compare == MPI_UNEQUAL)
flag = false;
MPI_Group_free(&group1);
MPI_Group_free(&group2);
MPI_Group_free(&group12);
#endif
// Perform a global reduce of the flag (equivalent to all operation)
return allReduce(flag);
}
/************************************************************************
* Overload operator >= *
************************************************************************/
bool MPI_CLASS::operator>=(const MPI_CLASS &comm) const {
bool flag = true;
// First check if either communicator is NULL
if (this->d_isNull)
return false;
if (comm.d_isNull)
flag = false;
#ifdef USE_MPI
int world_size = 0;
MPI_Comm_size(MPI_COMM_WORLD, &world_size);
if (getSize() == world_size)
return true;
if (comm.getSize() == 1 && !comm.d_isNull)
return true;
#endif
// Use compare to check if the comms are equal
if (compare(comm) != 0)
return true;
// Check that the size of the current communicator is >= the other communicator size
if (comm_size < comm.comm_size)
flag = false;
// Check the union of the communicator groups
// this is >= comm iff comm's group is a subgroup of this group
#ifdef USE_MPI
MPI_Group group1 = MPI_GROUP_EMPTY, group2 = MPI_GROUP_EMPTY,
group12 = MPI_GROUP_EMPTY;
if (!d_isNull)
MPI_Comm_group(communicator, &group1);
if (!comm.d_isNull)
MPI_Comm_group(comm.communicator, &group2);
MPI_Group_union(group1, group2, &group12);
int compare;
MPI_Group_compare(group1, group12, &compare);
if (compare == MPI_UNEQUAL)
flag = false;
MPI_Group_free(&group1);
MPI_Group_free(&group2);
MPI_Group_free(&group12);
#endif
// Perform a global reduce of the flag (equivalent to all operation)
return allReduce(flag);
}
/************************************************************************
* Compare two comm objects *
************************************************************************/
int MPI_CLASS::compare(const MPI_CLASS &comm) const {
if (communicator == comm.communicator)
return 1;
#ifdef USE_MPI
if (d_isNull || comm.d_isNull)
return 0;
int result;
check_MPI(MPI_Comm_compare(communicator, comm.communicator, &result));
if (result == MPI_IDENT)
return 2;
else if (result == MPI_CONGRUENT)
return 3;
else if (result == MPI_SIMILAR)
return 4;
else if (result == MPI_UNEQUAL)
return 0;
MPI_ERROR("Unknown results from comm compare");
#else
if (comm.communicator == MPI_COMM_NULL || communicator == MPI_COMM_NULL)
return 0;
else
return 3;
#endif
return 0;
}
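// Summary of compare() return values: 1 = identical MPI_Comm handles, 2 = MPI_IDENT,
// 3 = MPI_CONGRUENT (same group and rank order, different context), 4 = MPI_SIMILAR (same
// group, different rank order), 0 = unequal or either communicator is null.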
/************************************************************************
* Abort the program. *
************************************************************************/
void MPI_CLASS::setCallAbortInSerialInsteadOfExit(bool flag) {
d_call_abort = flag;
}
void MPI_CLASS::abort() const {
#ifdef USE_MPI
MPI_Comm comm = communicator;
if (comm == MPI_COMM_NULL)
comm = MPI_COMM_WORLD;
if (!MPI_active()) {
// MPI is not available
exit(-1);
} else if (comm_size > 1) {
MPI_Abort(comm, -1);
} else if (d_call_abort) {
MPI_Abort(comm, -1);
} else {
exit(-1);
}
#else
exit(-1);
#endif
}
/************************************************************************
* newTag *
************************************************************************/
int MPI_CLASS::newTag() {
#ifdef USE_MPI
// Synchronize the processes to ensure all ranks enter this call
// Needed so the count will match
barrier();
// Return and increment the tag
int tag = (*d_currentTag)++;
MPI_INSIST(tag <= d_maxTag, "Maximum number of tags exceeded\n");
return tag;
#else
static int globalCurrentTag = 1;
return globalCurrentTag++;
#endif
}
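// Note: with MPI enabled newTag() is collective (it calls barrier()), so every rank of the
// communicator must call it the same number of times; each call then returns the same tag
// value on every rank.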
/************************************************************************
* allReduce *
************************************************************************/
bool MPI_CLASS::allReduce(const bool value) const {
bool ret = value;
if (comm_size > 1) {
#ifdef USE_MPI
MPI_Allreduce((void *)&value, (void *)&ret, 1, MPI_UNSIGNED_CHAR,
MPI_MIN, communicator);
#else
MPI_ERROR("This shouldn't be possible");
#endif
}
return ret;
}
/************************************************************************
* anyReduce *
************************************************************************/
bool MPI_CLASS::anyReduce(const bool value) const {
bool ret = value;
if (comm_size > 1) {
#ifdef USE_MPI
MPI_Allreduce((void *)&value, (void *)&ret, 1, MPI_UNSIGNED_CHAR,
MPI_MAX, communicator);
#else
MPI_ERROR("This shouldn't be possible");
#endif
}
return ret;
}
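// Note: the two bool reductions above exploit the ordering false < true: reducing with
// MPI_MIN yields a logical AND across ranks (allReduce), while MPI_MAX yields a logical OR
// (anyReduce).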
/************************************************************************
* call_sumReduce *
* Note: these specializations are only called when using MPI. *
************************************************************************/
#ifdef USE_MPI
// unsigned char
template <>
void MPI_CLASS::call_sumReduce<unsigned char>(const unsigned char *send,
unsigned char *recv,
int n) const {
PROFILE_START("sumReduce1<unsigned char>", profile_level);
MPI_Allreduce((void *)send, (void *)recv, n, MPI_UNSIGNED_CHAR, MPI_SUM,
communicator);
PROFILE_STOP("sumReduce1<unsigned char>", profile_level);
}
template <>
void MPI_CLASS::call_sumReduce<unsigned char>(unsigned char *x, int n) const {
PROFILE_START("sumReduce2<unsigned char>", profile_level);
auto send = x;
auto recv = new unsigned char[n];
MPI_Allreduce(send, recv, n, MPI_UNSIGNED_CHAR, MPI_SUM, communicator);
for (int i = 0; i < n; i++)
x[i] = recv[i];
delete[] recv;
PROFILE_STOP("sumReduce2<unsigned char>", profile_level);
}
// char
template <>
void MPI_CLASS::call_sumReduce<char>(const char *send, char *recv,
int n) const {
PROFILE_START("sumReduce1<char>", profile_level);
MPI_Allreduce((void *)send, (void *)recv, n, MPI_SIGNED_CHAR, MPI_SUM,
communicator);
PROFILE_STOP("sumReduce1<char>", profile_level);
}
template <> void MPI_CLASS::call_sumReduce<char>(char *x, int n) const {
PROFILE_START("sumReduce2<char>", profile_level);
auto send = x;
auto recv = new char[n];
MPI_Allreduce(send, recv, n, MPI_SIGNED_CHAR, MPI_SUM, communicator);
for (int i = 0; i < n; i++)
x[i] = recv[i];
delete[] recv;
PROFILE_STOP("sumReduce2<char>", profile_level);
}
// unsigned int
template <>
void MPI_CLASS::call_sumReduce<unsigned int>(const unsigned int *send,
unsigned int *recv, int n) const {
PROFILE_START("sumReduce1<unsigned int>", profile_level);
MPI_Allreduce((void *)send, (void *)recv, n, MPI_UNSIGNED, MPI_SUM,
communicator);
PROFILE_STOP("sumReduce1<unsigned int>", profile_level);
}
template <>
void MPI_CLASS::call_sumReduce<unsigned int>(unsigned int *x, int n) const {
PROFILE_START("sumReduce2<unsigned int>", profile_level);
auto send = x;
auto recv = new unsigned int[n];
MPI_Allreduce(send, recv, n, MPI_UNSIGNED, MPI_SUM, communicator);
for (int i = 0; i < n; i++)
x[i] = recv[i];
delete[] recv;
PROFILE_STOP("sumReduce2<unsigned int>", profile_level);
}
// int
template <>
void MPI_CLASS::call_sumReduce<int>(const int *send, int *recv, int n) const {
PROFILE_START("sumReduce1<int>", profile_level);
MPI_Allreduce((void *)send, (void *)recv, n, MPI_INT, MPI_SUM,
communicator);
PROFILE_STOP("sumReduce1<int>", profile_level);
}
template <> void MPI_CLASS::call_sumReduce<int>(int *x, int n) const {
PROFILE_START("sumReduce2<int>", profile_level);
auto send = x;
auto recv = new int[n];
MPI_Allreduce(send, recv, n, MPI_INT, MPI_SUM, communicator);
for (int i = 0; i < n; i++)
x[i] = recv[i];
delete[] recv;
PROFILE_STOP("sumReduce2<int>", profile_level);
}
// long int
template <>
void MPI_CLASS::call_sumReduce<long int>(const long int *send, long int *recv,
int n) const {
PROFILE_START("sumReduce1<long int>", profile_level);
MPI_Allreduce((void *)send, (void *)recv, n, MPI_LONG, MPI_SUM,
communicator);
PROFILE_STOP("sumReduce1<long int>", profile_level);
}
template <> void MPI_CLASS::call_sumReduce<long int>(long int *x, int n) const {
PROFILE_START("sumReduce2<long int>", profile_level);
auto send = x;
auto recv = new long int[n];
MPI_Allreduce(send, recv, n, MPI_LONG, MPI_SUM, communicator);
for (int i = 0; i < n; i++)
x[i] = recv[i];
delete[] recv;
PROFILE_STOP("sumReduce2<long int>", profile_level);
}
// unsigned long int
template <>
void MPI_CLASS::call_sumReduce<unsigned long>(const unsigned long *send,
unsigned long *recv,
int n) const {
PROFILE_START("sumReduce1<unsigned long>", profile_level);
MPI_Allreduce((void *)send, (void *)recv, n, MPI_UNSIGNED_LONG, MPI_SUM,
communicator);
PROFILE_STOP("sumReduce1<unsigned long>", profile_level);
}
template <>
void MPI_CLASS::call_sumReduce<unsigned long>(unsigned long *x, int n) const {
PROFILE_START("sumReduce2<unsigned long>", profile_level);
auto send = x;
auto recv = new unsigned long int[n];
MPI_Allreduce(send, recv, n, MPI_UNSIGNED_LONG, MPI_SUM, communicator);
for (int i = 0; i < n; i++)
x[i] = recv[i];
delete[] recv;
PROFILE_STOP("sumReduce2<unsigned long>", profile_level);
}
// size_t
#ifdef USE_WINDOWS
template <>
void MPI_CLASS::call_sumReduce<size_t>(const size_t *send, size_t *recv,
int n) const {
MPI_ASSERT(MPI_SIZE_T != 0);
PROFILE_START("sumReduce1<size_t>", profile_level);
MPI_Allreduce((void *)send, (void *)recv, n, MPI_SIZE_T, MPI_SUM,
communicator);
PROFILE_STOP("sumReduce1<size_t>", profile_level);
}
template <> void MPI_CLASS::call_sumReduce<size_t>(size_t *x, int n) const {
MPI_ASSERT(MPI_SIZE_T != 0);
PROFILE_START("sumReduce2<size_t>", profile_level);
auto send = x;
auto recv = new size_t[n];
MPI_Allreduce((void *)send, (void *)recv, n, MPI_SIZE_T, MPI_SUM,
communicator);
for (int i = 0; i < n; i++)
x[i] = recv[i];
delete[] recv;
PROFILE_STOP("sumReduce2<size_t>", profile_level);
}
#endif
// float
template <>
void MPI_CLASS::call_sumReduce<float>(const float *send, float *recv,
int n) const {
PROFILE_START("sumReduce1<float>", profile_level);
MPI_Allreduce((void *)send, (void *)recv, n, MPI_FLOAT, MPI_SUM,
communicator);
PROFILE_STOP("sumReduce1<float>", profile_level);
}
template <> void MPI_CLASS::call_sumReduce<float>(float *x, int n) const {
PROFILE_START("sumReduce2<float>", profile_level);
auto send = x;
auto recv = new float[n];
MPI_Allreduce(send, recv, n, MPI_FLOAT, MPI_SUM, communicator);
for (int i = 0; i < n; i++)
x[i] = recv[i];
delete[] recv;
PROFILE_STOP("sumReduce2<float>", profile_level);
}
// double
template <>
void MPI_CLASS::call_sumReduce<double>(const double *send, double *recv,
int n) const {
PROFILE_START("sumReduce1<double>", profile_level);
MPI_Allreduce((void *)send, (void *)recv, n, MPI_DOUBLE, MPI_SUM,
communicator);
PROFILE_STOP("sumReduce1<double>", profile_level);
}
template <> void MPI_CLASS::call_sumReduce<double>(double *x, int n) const {
PROFILE_START("sumReduce2<double>", profile_level);
auto send = x;
auto recv = new double[n];
MPI_Allreduce(send, recv, n, MPI_DOUBLE, MPI_SUM, communicator);
for (int i = 0; i < n; i++)
x[i] = recv[i];
delete[] recv;
PROFILE_STOP("sumReduce2<double>", profile_level);
}
// std::complex<double>
template <>
void MPI_CLASS::call_sumReduce<std::complex<double>>(
const std::complex<double> *x, std::complex<double> *y, int n) const {
PROFILE_START("sumReduce1<complex double>", profile_level);
auto send = new double[2 * n];
auto recv = new double[2 * n];
for (int i = 0; i < n; i++) {
send[2 * i + 0] = real(x[i]);
send[2 * i + 1] = imag(x[i]);
}
MPI_Allreduce((void *)send, (void *)recv, 2 * n, MPI_DOUBLE, MPI_SUM,
communicator);
for (int i = 0; i < n; i++)
y[i] = std::complex<double>(recv[2 * i + 0], recv[2 * i + 1]);
delete[] send;
delete[] recv;
PROFILE_STOP("sumReduce1<complex double>", profile_level);
}
template <>
void MPI_CLASS::call_sumReduce<std::complex<double>>(std::complex<double> *x,
int n) const {
PROFILE_START("sumReduce2<complex double>", profile_level);
auto send = new double[2 * n];
auto recv = new double[2 * n];
for (int i = 0; i < n; i++) {
send[2 * i + 0] = real(x[i]);
send[2 * i + 1] = imag(x[i]);
}
MPI_Allreduce(send, recv, 2 * n, MPI_DOUBLE, MPI_SUM, communicator);
for (int i = 0; i < n; i++)
x[i] = std::complex<double>(recv[2 * i + 0], recv[2 * i + 1]);
delete[] send;
delete[] recv;
PROFILE_STOP("sumReduce2<complex double>", profile_level);
}
#endif
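// The specializations above back the templated sumReduce() wrappers in common/MPI.h
// (assumed interface); a typical call site (illustrative, names hypothetical) is:
//     double local = computeLocalContribution();
//     double total = comm.sumReduce(local);   // same total on every rank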
/************************************************************************
* call_minReduce *
* Note: these specializations are only called when using MPI. *
************************************************************************/
#ifdef USE_MPI
// unsigned char
template <>
void MPI_CLASS::call_minReduce<unsigned char>(const unsigned char *send,
unsigned char *recv, int n,
int *comm_rank_of_min) const {
if (comm_rank_of_min == nullptr) {
PROFILE_START("minReduce1<unsigned char>", profile_level);
MPI_Allreduce((void *)send, (void *)recv, n, MPI_UNSIGNED_CHAR, MPI_MIN,
communicator);
PROFILE_STOP("minReduce1<unsigned char>", profile_level);
} else {
auto tmp = new int[n];
for (int i = 0; i < n; i++)
tmp[i] = send[i];
call_minReduce<int>(tmp, n, comm_rank_of_min);
for (int i = 0; i < n; i++)
recv[i] = static_cast<unsigned char>(tmp[i]);
delete[] tmp;
}
}
template <>
void MPI_CLASS::call_minReduce<unsigned char>(unsigned char *x, int n,
int *comm_rank_of_min) const {
if (comm_rank_of_min == nullptr) {
PROFILE_START("minReduce2<unsigned char>", profile_level);
auto send = x;
auto recv = new unsigned char[n];
MPI_Allreduce(send, recv, n, MPI_UNSIGNED_CHAR, MPI_MIN, communicator);
for (int i = 0; i < n; i++)
x[i] = recv[i];
delete[] recv;
PROFILE_STOP("minReduce2<unsigned char>", profile_level);
} else {
auto tmp = new int[n];
for (int i = 0; i < n; i++)
tmp[i] = x[i];
call_minReduce<int>(tmp, n, comm_rank_of_min);
for (int i = 0; i < n; i++)
x[i] = static_cast<unsigned char>(tmp[i]);
delete[] tmp;
}
}
// char
template <>
void MPI_CLASS::call_minReduce<char>(const char *send, char *recv, int n,
int *comm_rank_of_min) const {
if (comm_rank_of_min == nullptr) {
PROFILE_START("minReduce1<char>", profile_level);
MPI_Allreduce((void *)send, (void *)recv, n, MPI_SIGNED_CHAR, MPI_MIN,
communicator);
PROFILE_STOP("minReduce1<char>", profile_level);
} else {
auto tmp = new int[n];
for (int i = 0; i < n; i++)
tmp[i] = send[i];
call_minReduce<int>(tmp, n, comm_rank_of_min);
for (int i = 0; i < n; i++)
recv[i] = static_cast<char>(tmp[i]);
delete[] tmp;
}
}
template <>
void MPI_CLASS::call_minReduce<char>(char *x, int n,
int *comm_rank_of_min) const {
if (comm_rank_of_min == nullptr) {
PROFILE_START("minReduce2<char>", profile_level);
auto send = x;
auto recv = new char[n];
MPI_Allreduce(send, recv, n, MPI_SIGNED_CHAR, MPI_MIN, communicator);
for (int i = 0; i < n; i++)
x[i] = recv[i];
delete[] recv;
PROFILE_STOP("minReduce2<char>", profile_level);
} else {
auto tmp = new int[n];
for (int i = 0; i < n; i++)
tmp[i] = x[i];
call_minReduce<int>(tmp, n, comm_rank_of_min);
for (int i = 0; i < n; i++)
x[i] = static_cast<char>(tmp[i]);
delete[] tmp;
}
}
// unsigned int
template <>
void MPI_CLASS::call_minReduce<unsigned int>(const unsigned int *send,
unsigned int *recv, int n,
int *comm_rank_of_min) const {
if (comm_rank_of_min == nullptr) {
PROFILE_START("minReduce1<unsigned int>", profile_level);
MPI_Allreduce((void *)send, (void *)recv, n, MPI_UNSIGNED, MPI_MIN,
communicator);
PROFILE_STOP("minReduce1<unsigned int>", profile_level);
} else {
auto tmp = new int[n];
for (int i = 0; i < n; i++)
tmp[i] = unsigned_to_signed(send[i]);
call_minReduce<int>(tmp, n, comm_rank_of_min);
for (int i = 0; i < n; i++)
recv[i] = signed_to_unsigned(tmp[i]);
delete[] tmp;
}
}
template <>
void MPI_CLASS::call_minReduce<unsigned int>(unsigned int *x, int n,
int *comm_rank_of_min) const {
if (comm_rank_of_min == nullptr) {
PROFILE_START("minReduce2<unsigned int>", profile_level);
auto send = x;
auto recv = new unsigned int[n];
MPI_Allreduce(send, recv, n, MPI_UNSIGNED, MPI_MIN, communicator);
for (int i = 0; i < n; i++)
x[i] = recv[i];
delete[] recv;
PROFILE_STOP("minReduce2<unsigned int>", profile_level);
} else {
auto tmp = new int[n];
for (int i = 0; i < n; i++)
tmp[i] = unsigned_to_signed(x[i]);
call_minReduce<int>(tmp, n, comm_rank_of_min);
for (int i = 0; i < n; i++)
x[i] = signed_to_unsigned(tmp[i]);
delete[] tmp;
}
}
// int
template <>
void MPI_CLASS::call_minReduce<int>(const int *x, int *y, int n,
int *comm_rank_of_min) const {
PROFILE_START("minReduce1<int>", profile_level);
if (comm_rank_of_min == nullptr) {
MPI_Allreduce((void *)x, (void *)y, n, MPI_INT, MPI_MIN, communicator);
} else {
auto recv = new IntIntStruct[n];
auto send = new IntIntStruct[n];
for (int i = 0; i < n; ++i) {
send[i].j = x[i];
send[i].i = comm_rank;
}
MPI_Allreduce(send, recv, n, MPI_2INT, MPI_MINLOC, communicator);
for (int i = 0; i < n; ++i) {
y[i] = recv[i].j;
comm_rank_of_min[i] = recv[i].i;
}
delete[] recv;
delete[] send;
}
PROFILE_STOP("minReduce1<int>", profile_level);
}
template <>
void MPI_CLASS::call_minReduce<int>(int *x, int n,
int *comm_rank_of_min) const {
PROFILE_START("minReduce2<int>", profile_level);
if (comm_rank_of_min == nullptr) {
auto send = x;
auto recv = new int[n];
MPI_Allreduce(send, recv, n, MPI_INT, MPI_MIN, communicator);
for (int i = 0; i < n; i++)
x[i] = recv[i];
delete[] recv;
} else {
auto recv = new IntIntStruct[n];
auto send = new IntIntStruct[n];
for (int i = 0; i < n; ++i) {
send[i].j = x[i];
send[i].i = comm_rank;
}
MPI_Allreduce(send, recv, n, MPI_2INT, MPI_MINLOC, communicator);
for (int i = 0; i < n; ++i) {
x[i] = recv[i].j;
comm_rank_of_min[i] = recv[i].i;
}
delete[] recv;
delete[] send;
}
PROFILE_STOP("minReduce2<int>", profile_level);
}
// unsigned long int
template <>
void MPI_CLASS::call_minReduce<unsigned long int>(const unsigned long int *send,
unsigned long int *recv,
int n,
int *comm_rank_of_min) const {
if (comm_rank_of_min == nullptr) {
PROFILE_START("minReduce1<unsigned long>", profile_level);
MPI_Allreduce((void *)send, (void *)recv, n, MPI_UNSIGNED_LONG, MPI_MIN,
communicator);
PROFILE_STOP("minReduce1<unsigned long>", profile_level);
} else {
auto tmp = new long int[n];
for (int i = 0; i < n; i++)
tmp[i] = unsigned_to_signed(send[i]);
call_minReduce<long int>(tmp, n, comm_rank_of_min);
for (int i = 0; i < n; i++)
recv[i] = signed_to_unsigned(tmp[i]);
delete[] tmp;
}
}
template <>
void MPI_CLASS::call_minReduce<unsigned long int>(unsigned long int *x, int n,
int *comm_rank_of_min) const {
if (comm_rank_of_min == nullptr) {
PROFILE_START("minReduce2<unsigned long>", profile_level);
auto send = x;
auto recv = new unsigned long int[n];
MPI_Allreduce(send, recv, n, MPI_UNSIGNED_LONG, MPI_MIN, communicator);
for (int i = 0; i < n; i++)
x[i] = recv[i];
delete[] recv;
PROFILE_STOP("minReduce2<unsigned long>", profile_level);
} else {
auto tmp = new long int[n];
for (int i = 0; i < n; i++)
tmp[i] = unsigned_to_signed(x[i]);
call_minReduce<long int>(tmp, n, comm_rank_of_min);
for (int i = 0; i < n; i++)
x[i] = signed_to_unsigned(tmp[i]);
delete[] tmp;
}
}
// long int
template <>
void MPI_CLASS::call_minReduce<long int>(const long int *x, long int *y, int n,
int *comm_rank_of_min) const {
PROFILE_START("minReduce1<long int>", profile_level);
if (comm_rank_of_min == nullptr) {
MPI_Allreduce((void *)x, (void *)y, n, MPI_LONG, MPI_MIN, communicator);
} else {
auto recv = new LongIntStruct[n];
auto send = new LongIntStruct[n];
for (int i = 0; i < n; ++i) {
send[i].j = x[i];
send[i].i = comm_rank;
}
MPI_Allreduce(send, recv, n, MPI_LONG_INT, MPI_MINLOC, communicator);
for (int i = 0; i < n; ++i) {
y[i] = recv[i].j;
comm_rank_of_min[i] = recv[i].i;
}
delete[] recv;
delete[] send;
}
PROFILE_STOP("minReduce1<long int>", profile_level);
}
template <>
void MPI_CLASS::call_minReduce<long int>(long int *x, int n,
int *comm_rank_of_min) const {
PROFILE_START("minReduce2<long int>", profile_level);
if (comm_rank_of_min == nullptr) {
auto send = x;
auto recv = new long int[n];
MPI_Allreduce(send, recv, n, MPI_LONG, MPI_MIN, communicator);
for (long int i = 0; i < n; i++)
x[i] = recv[i];
delete[] recv;
} else {
auto recv = new LongIntStruct[n];
auto send = new LongIntStruct[n];
for (int i = 0; i < n; ++i) {
send[i].j = x[i];
send[i].i = comm_rank;
}
MPI_Allreduce(send, recv, n, MPI_LONG_INT, MPI_MINLOC, communicator);
for (int i = 0; i < n; ++i) {
x[i] = recv[i].j;
comm_rank_of_min[i] = recv[i].i;
}
delete[] recv;
delete[] send;
}
PROFILE_STOP("minReduce2<long int>", profile_level);
}
// unsigned long long int
template <>
void MPI_CLASS::call_minReduce<unsigned long long int>(
const unsigned long long int *send, unsigned long long int *recv, int n,
int *comm_rank_of_min) const {
PROFILE_START("minReduce1<unsigned long long>", profile_level);
if (comm_rank_of_min == nullptr) {
auto x = new long long int[n];
auto y = new long long int[n];
for (int i = 0; i < n; i++)
x[i] = unsigned_to_signed(send[i]);
MPI_Allreduce((void *)x, (void *)y, n, MPI_LONG_LONG_INT, MPI_MIN,
communicator);
for (int i = 0; i < n; i++)
recv[i] = signed_to_unsigned(y[i]);
delete[] x;
delete[] y;
} else {
printf("minReduce<long long int> will use double\n");
auto tmp = new double[n];
for (int i = 0; i < n; i++)
tmp[i] = static_cast<double>(send[i]);
call_minReduce<double>(tmp, n, comm_rank_of_min);
for (int i = 0; i < n; i++)
recv[i] = static_cast<long long int>(tmp[i]);
delete[] tmp;
}
PROFILE_STOP("minReduce1<unsigned long long>", profile_level);
}
template <>
void MPI_CLASS::call_minReduce<unsigned long long int>(
unsigned long long int *x, int n, int *comm_rank_of_min) const {
auto recv = new unsigned long long int[n];
call_minReduce<unsigned long long int>(x, recv, n, comm_rank_of_min);
for (int i = 0; i < n; i++)
x[i] = recv[i];
delete[] recv;
}
// long long int
template <>
void MPI_CLASS::call_minReduce<long long int>(const long long int *x,
long long int *y, int n,
int *comm_rank_of_min) const {
PROFILE_START("minReduce1<long long int>", profile_level);
if (comm_rank_of_min == nullptr) {
MPI_Allreduce((void *)x, (void *)y, n, MPI_LONG_LONG_INT, MPI_MIN,
communicator);
} else {
printf("minReduce<long long int> will use double\n");
auto tmp = new double[n];
for (int i = 0; i < n; i++)
tmp[i] = static_cast<double>(x[i]);
call_minReduce<double>(tmp, n, comm_rank_of_min);
for (int i = 0; i < n; i++)
y[i] = static_cast<long long int>(tmp[i]);
delete[] tmp;
}
PROFILE_STOP("minReduce1<long long int>", profile_level);
}
template <>
void MPI_CLASS::call_minReduce<long long int>(long long int *x, int n,
int *comm_rank_of_min) const {
auto recv = new long long int[n];
call_minReduce<long long int>(x, recv, n, comm_rank_of_min);
for (int i = 0; i < n; i++)
x[i] = recv[i];
delete[] recv;
}
// float
template <>
void MPI_CLASS::call_minReduce<float>(const float *x, float *y, int n,
int *comm_rank_of_min) const {
PROFILE_START("minReduce1<float>", profile_level);
if (comm_rank_of_min == nullptr) {
MPI_Allreduce((void *)x, (void *)y, n, MPI_FLOAT, MPI_MIN, communicator);
} else {
auto recv = new FloatIntStruct[n];
auto send = new FloatIntStruct[n];
for (int i = 0; i < n; ++i) {
send[i].f = x[i];
send[i].i = comm_rank;
}
MPI_Allreduce(send, recv, n, MPI_FLOAT_INT, MPI_MINLOC, communicator);
for (int i = 0; i < n; ++i) {
y[i] = recv[i].f;
comm_rank_of_min[i] = recv[i].i;
}
delete[] recv;
delete[] send;
}
PROFILE_STOP("minReduce1<float>", profile_level);
}
template <>
void MPI_CLASS::call_minReduce<float>(float *x, int n,
int *comm_rank_of_min) const {
PROFILE_START("minReduce2<float>", profile_level);
if (comm_rank_of_min == nullptr) {
auto send = x;
auto recv = new float[n];
MPI_Allreduce(send, recv, n, MPI_FLOAT, MPI_MIN, communicator);
for (int i = 0; i < n; i++)
x[i] = recv[i];
delete[] recv;
} else {
auto recv = new FloatIntStruct[n];
auto send = new FloatIntStruct[n];
for (int i = 0; i < n; ++i) {
send[i].f = x[i];
send[i].i = comm_rank;
}
MPI_Allreduce(send, recv, n, MPI_FLOAT_INT, MPI_MINLOC, communicator);
for (int i = 0; i < n; ++i) {
x[i] = recv[i].f;
comm_rank_of_min[i] = recv[i].i;
}
delete[] recv;
delete[] send;
}
PROFILE_STOP("minReduce2<float>", profile_level);
}
// double
template <>
void MPI_CLASS::call_minReduce<double>(const double *x, double *y, int n,
int *comm_rank_of_min) const {
PROFILE_START("minReduce1<double>", profile_level);
if (comm_rank_of_min == nullptr) {
MPI_Allreduce((void *)x, (void *)y, n, MPI_DOUBLE, MPI_MIN,
communicator);
} else {
auto recv = new DoubleIntStruct[n];
auto send = new DoubleIntStruct[n];
for (int i = 0; i < n; ++i) {
send[i].d = x[i];
send[i].i = comm_rank;
}
MPI_Allreduce(send, recv, n, MPI_DOUBLE_INT, MPI_MINLOC, communicator);
for (int i = 0; i < n; ++i) {
y[i] = recv[i].d;
comm_rank_of_min[i] = recv[i].i;
}
delete[] recv;
delete[] send;
}
PROFILE_STOP("minReduce1<double>", profile_level);
}
template <>
void MPI_CLASS::call_minReduce<double>(double *x, int n,
int *comm_rank_of_min) const {
PROFILE_START("minReduce2<double>", profile_level);
if (comm_rank_of_min == nullptr) {
auto send = x;
auto recv = new double[n];
MPI_Allreduce(send, recv, n, MPI_DOUBLE, MPI_MIN, communicator);
for (int i = 0; i < n; i++)
x[i] = recv[i];
delete[] recv;
} else {
auto recv = new DoubleIntStruct[n];
auto send = new DoubleIntStruct[n];
for (int i = 0; i < n; ++i) {
send[i].d = x[i];
send[i].i = comm_rank;
}
MPI_Allreduce(send, recv, n, MPI_DOUBLE_INT, MPI_MINLOC, communicator);
for (int i = 0; i < n; ++i) {
x[i] = recv[i].d;
comm_rank_of_min[i] = recv[i].i;
}
delete[] recv;
delete[] send;
}
PROFILE_STOP("minReduce2<double>", profile_level);
}
#endif
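// The minReduce specializations accept an optional rank output: when comm_rank_of_min is
// non-null the reduction switches to MPI_MINLOC on (value, rank) pairs so the caller also
// learns which rank held each minimum. A hedged sketch of a call through the templated
// wrapper in common/MPI.h (signature assumed):
//     double x = localValue;
//     int rank_of_min = -1;
//     comm.minReduce(&x, 1, &rank_of_min);   // x becomes the global minimum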
/************************************************************************
* call_maxReduce *
* Note: these specializations are only called when using MPI. *
************************************************************************/
#ifdef USE_MPI
// unsigned char
template <>
void MPI_CLASS::call_maxReduce<unsigned char>(const unsigned char *send,
unsigned char *recv, int n,
int *comm_rank_of_max) const {
if (comm_rank_of_max == nullptr) {
PROFILE_START("maxReduce1<unsigned char>", profile_level);
MPI_Allreduce((void *)send, (void *)recv, n, MPI_UNSIGNED_CHAR, MPI_MAX,
communicator);
PROFILE_STOP("maxReduce1<unsigned char>", profile_level);
} else {
auto tmp = new int[n];
for (int i = 0; i < n; i++)
tmp[i] = send[i];
call_maxReduce<int>(tmp, n, comm_rank_of_max);
for (int i = 0; i < n; i++)
recv[i] = static_cast<unsigned char>(tmp[i]);
delete[] tmp;
}
}
template <>
void MPI_CLASS::call_maxReduce<unsigned char>(unsigned char *x, int n,
int *comm_rank_of_max) const {
if (comm_rank_of_max == nullptr) {
PROFILE_START("maxReduce2<unsigned char>", profile_level);
auto send = x;
auto recv = new unsigned char[n];
MPI_Allreduce(send, recv, n, MPI_UNSIGNED_CHAR, MPI_MAX, communicator);
for (int i = 0; i < n; i++)
x[i] = recv[i];
delete[] recv;
PROFILE_STOP("maxReduce2<unsigned char>", profile_level);
} else {
auto tmp = new int[n];
for (int i = 0; i < n; i++)
tmp[i] = x[i];
call_maxReduce<int>(tmp, n, comm_rank_of_max);
for (int i = 0; i < n; i++)
x[i] = static_cast<unsigned char>(tmp[i]);
delete[] tmp;
}
}
// char
template <>
void MPI_CLASS::call_maxReduce<char>(const char *send, char *recv, int n,
int *comm_rank_of_max) const {
if (comm_rank_of_max == nullptr) {
PROFILE_START("maxReduce1<char>", profile_level);
MPI_Allreduce((void *)send, (void *)recv, n, MPI_SIGNED_CHAR, MPI_MAX,
communicator);
PROFILE_STOP("maxReduce1<char>", profile_level);
} else {
auto tmp = new int[n];
for (int i = 0; i < n; i++)
tmp[i] = send[i];
call_maxReduce<int>(tmp, n, comm_rank_of_max);
for (int i = 0; i < n; i++)
recv[i] = static_cast<char>(tmp[i]);
delete[] tmp;
}
}
template <>
void MPI_CLASS::call_maxReduce<char>(char *x, int n,
int *comm_rank_of_max) const {
if (comm_rank_of_max == nullptr) {
PROFILE_START("maxReduce2<char>", profile_level);
auto send = x;
auto recv = new char[n];
MPI_Allreduce(send, recv, n, MPI_SIGNED_CHAR, MPI_MAX, communicator);
for (int i = 0; i < n; i++)
x[i] = recv[i];
delete[] recv;
PROFILE_STOP("maxReduce2<char>", profile_level);
} else {
auto tmp = new int[n];
for (int i = 0; i < n; i++)
tmp[i] = x[i];
call_maxReduce<int>(tmp, n, comm_rank_of_max);
for (int i = 0; i < n; i++)
x[i] = static_cast<char>(tmp[i]);
delete[] tmp;
}
}
// unsigned int
template <>
void MPI_CLASS::call_maxReduce<unsigned int>(const unsigned int *send,
unsigned int *recv, int n,
int *comm_rank_of_max) const {
if (comm_rank_of_max == nullptr) {
PROFILE_START("maxReduce1<unsigned int>", profile_level);
MPI_Allreduce((void *)send, (void *)recv, n, MPI_UNSIGNED, MPI_MAX,
communicator);
PROFILE_STOP("maxReduce1<unsigned int>", profile_level);
} else {
auto tmp = new int[n];
for (int i = 0; i < n; i++)
tmp[i] = unsigned_to_signed(send[i]);
call_maxReduce<int>(tmp, n, comm_rank_of_max);
for (int i = 0; i < n; i++)
recv[i] = signed_to_unsigned(tmp[i]);
delete[] tmp;
}
}
template <>
void MPI_CLASS::call_maxReduce<unsigned int>(unsigned int *x, int n,
int *comm_rank_of_max) const {
if (comm_rank_of_max == nullptr) {
PROFILE_START("maxReduce2<unsigned int>", profile_level);
auto send = x;
auto recv = new unsigned int[n];
MPI_Allreduce(send, recv, n, MPI_UNSIGNED, MPI_MAX, communicator);
for (int i = 0; i < n; i++)
x[i] = recv[i];
delete[] recv;
PROFILE_STOP("maxReduce2<unsigned int>", profile_level);
} else {
auto tmp = new int[n];
for (int i = 0; i < n; i++)
tmp[i] = unsigned_to_signed(x[i]);
call_maxReduce<int>(tmp, n, comm_rank_of_max);
for (int i = 0; i < n; i++)
x[i] = signed_to_unsigned(tmp[i]);
delete[] tmp;
}
}
// int
template <>
void MPI_CLASS::call_maxReduce<int>(const int *x, int *y, int n,
int *comm_rank_of_max) const {
PROFILE_START("maxReduce1<int>", profile_level);
if (comm_rank_of_max == nullptr) {
MPI_Allreduce((void *)x, (void *)y, n, MPI_INT, MPI_MAX, communicator);
} else {
auto recv = new IntIntStruct[n];
auto send = new IntIntStruct[n];
for (int i = 0; i < n; ++i) {
send[i].j = x[i];
send[i].i = comm_rank;
}
MPI_Allreduce(send, recv, n, MPI_2INT, MPI_MAXLOC, communicator);
for (int i = 0; i < n; ++i) {
y[i] = recv[i].j;
comm_rank_of_max[i] = recv[i].i;
}
delete[] recv;
delete[] send;
}
PROFILE_STOP("maxReduce1<int>", profile_level);
}
template <>
void MPI_CLASS::call_maxReduce<int>(int *x, int n,
int *comm_rank_of_max) const {
PROFILE_START("maxReduce2<int>", profile_level);
if (comm_rank_of_max == nullptr) {
int *send = x;
auto recv = new int[n];
MPI_Allreduce(send, recv, n, MPI_INT, MPI_MAX, communicator);
for (int i = 0; i < n; i++)
x[i] = recv[i];
delete[] recv;
} else {
auto recv = new IntIntStruct[n];
auto send = new IntIntStruct[n];
for (int i = 0; i < n; ++i) {
send[i].j = x[i];
send[i].i = comm_rank;
}
MPI_Allreduce(send, recv, n, MPI_2INT, MPI_MAXLOC, communicator);
for (int i = 0; i < n; ++i) {
x[i] = recv[i].j;
comm_rank_of_max[i] = recv[i].i;
}
delete[] recv;
delete[] send;
}
PROFILE_STOP("maxReduce2<int>", profile_level);
}
// long int
template <>
void MPI_CLASS::call_maxReduce<long int>(const long int *x, long int *y, int n,
int *comm_rank_of_max) const {
PROFILE_START("maxReduce1<lond int>", profile_level);
if (comm_rank_of_max == nullptr) {
MPI_Allreduce((void *)x, (void *)y, n, MPI_LONG, MPI_MAX, communicator);
} else {
auto recv = new LongIntStruct[n];
auto send = new LongIntStruct[n];
for (int i = 0; i < n; ++i) {
send[i].j = x[i];
send[i].i = comm_rank;
}
MPI_Allreduce(send, recv, n, MPI_LONG_INT, MPI_MAXLOC, communicator);
for (int i = 0; i < n; ++i) {
y[i] = recv[i].j;
comm_rank_of_max[i] = recv[i].i;
}
delete[] recv;
delete[] send;
}
PROFILE_STOP("maxReduce1<lond int>", profile_level);
}
template <>
void MPI_CLASS::call_maxReduce<long int>(long int *x, int n,
int *comm_rank_of_max) const {
PROFILE_START("maxReduce2<lond int>", profile_level);
if (comm_rank_of_max == nullptr) {
auto send = x;
auto recv = new long int[n];
MPI_Allreduce(send, recv, n, MPI_LONG, MPI_MAX, communicator);
for (int i = 0; i < n; i++)
x[i] = recv[i];
delete[] recv;
} else {
auto recv = new LongIntStruct[n];
auto send = new LongIntStruct[n];
for (int i = 0; i < n; ++i) {
send[i].j = x[i];
send[i].i = comm_rank;
}
MPI_Allreduce(send, recv, n, MPI_LONG_INT, MPI_MAXLOC, communicator);
for (int i = 0; i < n; ++i) {
x[i] = recv[i].j;
comm_rank_of_max[i] = recv[i].i;
}
delete[] recv;
delete[] send;
}
PROFILE_STOP("maxReduce2<lond int>", profile_level);
}
// unsigned long int
template <>
void MPI_CLASS::call_maxReduce<unsigned long int>(const unsigned long int *send,
unsigned long int *recv,
int n,
int *comm_rank_of_max) const {
if (comm_rank_of_max == nullptr) {
PROFILE_START("maxReduce1<unsigned long>", profile_level);
MPI_Allreduce((void *)send, (void *)recv, n, MPI_UNSIGNED_LONG, MPI_MAX,
communicator);
PROFILE_STOP("maxReduce1<unsigned long>", profile_level);
} else {
auto tmp = new long int[n];
for (int i = 0; i < n; i++)
tmp[i] = unsigned_to_signed(send[i]);
call_maxReduce<long int>(tmp, n, comm_rank_of_max);
for (int i = 0; i < n; i++)
recv[i] = signed_to_unsigned(tmp[i]);
delete[] tmp;
}
}
template <>
void MPI_CLASS::call_maxReduce<unsigned long int>(unsigned long int *x, int n,
int *comm_rank_of_max) const {
if (comm_rank_of_max == nullptr) {
PROFILE_START("maxReduce2<unsigned long>", profile_level);
auto send = x;
auto recv = new unsigned long int[n];
MPI_Allreduce(send, recv, n, MPI_UNSIGNED_LONG, MPI_MAX, communicator);
for (int i = 0; i < n; i++)
x[i] = recv[i];
delete[] recv;
PROFILE_STOP("maxReduce2<unsigned long>", profile_level);
} else {
auto tmp = new long int[n];
for (int i = 0; i < n; i++)
tmp[i] = unsigned_to_signed(x[i]);
call_maxReduce<long int>(tmp, n, comm_rank_of_max);
for (int i = 0; i < n; i++)
x[i] = signed_to_unsigned(tmp[i]);
delete[] tmp;
}
}
// unsigned long long int
template <>
void MPI_CLASS::call_maxReduce<unsigned long long int>(
const unsigned long long int *send, unsigned long long int *recv, int n,
int *comm_rank_of_max) const {
PROFILE_START("maxReduce1<long int>", profile_level);
if (comm_rank_of_max == nullptr) {
auto x = new long long int[n];
auto y = new long long int[n];
for (int i = 0; i < n; i++)
x[i] = unsigned_to_signed(send[i]);
MPI_Allreduce((void *)x, (void *)y, n, MPI_LONG_LONG_INT, MPI_MAX,
communicator);
for (int i = 0; i < n; i++)
recv[i] = signed_to_unsigned(y[i]);
delete[] x;
delete[] y;
} else {
printf("maxReduce<long long int> will use double\n");
auto tmp = new double[n];
for (int i = 0; i < n; i++)
tmp[i] = static_cast<double>(send[i]);
call_maxReduce<double>(tmp, n, comm_rank_of_max);
for (int i = 0; i < n; i++)
recv[i] = static_cast<long long int>(tmp[i]);
delete[] tmp;
}
PROFILE_STOP("maxReduce1<long int>", profile_level);
}
template <>
void MPI_CLASS::call_maxReduce<unsigned long long int>(
unsigned long long int *x, int n, int *comm_rank_of_max) const {
auto recv = new unsigned long long int[n];
call_maxReduce<unsigned long long int>(x, recv, n, comm_rank_of_max);
for (int i = 0; i < n; i++)
x[i] = recv[i];
delete[] recv;
}
// long long int
template <>
void MPI_CLASS::call_maxReduce<long long int>(const long long int *x,
long long int *y, int n,
int *comm_rank_of_max) const {
PROFILE_START("maxReduce1<long int>", profile_level);
if (comm_rank_of_max == nullptr) {
MPI_Allreduce((void *)x, (void *)y, n, MPI_LONG_LONG_INT, MPI_MAX,
communicator);
} else {
printf("maxReduce<long long int> will use double\n");
auto tmp = new double[n];
for (int i = 0; i < n; i++)
tmp[i] = static_cast<double>(x[i]);
call_maxReduce<double>(tmp, n, comm_rank_of_max);
for (int i = 0; i < n; i++)
y[i] = static_cast<long long int>(tmp[i]);
delete[] tmp;
}
PROFILE_STOP("maxReduce1<long int>", profile_level);
}
template <>
void MPI_CLASS::call_maxReduce<long long int>(long long int *x, int n,
int *comm_rank_of_max) const {
auto recv = new long long int[n];
call_maxReduce<long long int>(x, recv, n, comm_rank_of_max);
for (int i = 0; i < n; i++)
        x[i] = recv[i];
delete[] recv;
}
// float
template <>
void MPI_CLASS::call_maxReduce<float>(const float *x, float *y, int n,
int *comm_rank_of_max) const {
PROFILE_START("maxReduce1<float>", profile_level);
if (comm_rank_of_max == nullptr) {
MPI_Allreduce((void *)x, (void *)y, n, MPI_FLOAT, MPI_MAX,
communicator);
} else {
auto recv = new FloatIntStruct[n];
auto send = new FloatIntStruct[n];
for (int i = 0; i < n; ++i) {
send[i].f = x[i];
send[i].i = comm_rank;
}
MPI_Allreduce(send, recv, n, MPI_FLOAT_INT, MPI_MAXLOC, communicator);
for (int i = 0; i < n; ++i) {
y[i] = recv[i].f;
comm_rank_of_max[i] = recv[i].i;
}
delete[] recv;
delete[] send;
}
PROFILE_STOP("maxReduce1<float>", profile_level);
}
template <>
void MPI_CLASS::call_maxReduce<float>(float *x, int n,
int *comm_rank_of_max) const {
PROFILE_START("maxReduce2<float>", profile_level);
if (comm_rank_of_max == nullptr) {
auto send = x;
auto recv = new float[n];
MPI_Allreduce(send, recv, n, MPI_FLOAT, MPI_MAX, communicator);
for (int i = 0; i < n; i++)
x[i] = recv[i];
delete[] recv;
} else {
auto recv = new FloatIntStruct[n];
auto send = new FloatIntStruct[n];
for (int i = 0; i < n; ++i) {
send[i].f = x[i];
send[i].i = comm_rank;
}
MPI_Allreduce(send, recv, n, MPI_FLOAT_INT, MPI_MAXLOC, communicator);
for (int i = 0; i < n; ++i) {
x[i] = recv[i].f;
comm_rank_of_max[i] = recv[i].i;
}
delete[] recv;
delete[] send;
}
PROFILE_STOP("maxReduce2<float>", profile_level);
}
// double
template <>
void MPI_CLASS::call_maxReduce<double>(const double *x, double *y, int n,
int *comm_rank_of_max) const {
PROFILE_START("maxReduce1<double>", profile_level);
if (comm_rank_of_max == nullptr) {
MPI_Allreduce((void *)x, (void *)y, n, MPI_DOUBLE, MPI_MAX,
communicator);
} else {
auto recv = new DoubleIntStruct[n];
auto send = new DoubleIntStruct[n];
for (int i = 0; i < n; ++i) {
send[i].d = x[i];
send[i].i = comm_rank;
}
MPI_Allreduce(send, recv, n, MPI_DOUBLE_INT, MPI_MAXLOC, communicator);
for (int i = 0; i < n; ++i) {
y[i] = recv[i].d;
comm_rank_of_max[i] = recv[i].i;
}
delete[] recv;
delete[] send;
}
PROFILE_STOP("maxReduce1<double>", profile_level);
}
template <>
void MPI_CLASS::call_maxReduce<double>(double *x, int n,
int *comm_rank_of_max) const {
PROFILE_START("maxReduce2<double>", profile_level);
if (comm_rank_of_max == nullptr) {
auto send = x;
auto recv = new double[n];
MPI_Allreduce(send, recv, n, MPI_DOUBLE, MPI_MAX, communicator);
for (int i = 0; i < n; i++)
x[i] = recv[i];
delete[] recv;
} else {
auto recv = new DoubleIntStruct[n];
auto send = new DoubleIntStruct[n];
for (int i = 0; i < n; ++i) {
send[i].d = x[i];
send[i].i = comm_rank;
}
MPI_Allreduce(send, recv, n, MPI_DOUBLE_INT, MPI_MAXLOC, communicator);
for (int i = 0; i < n; ++i) {
x[i] = recv[i].d;
comm_rank_of_max[i] = recv[i].i;
}
delete[] recv;
delete[] send;
}
PROFILE_STOP("maxReduce2<double>", profile_level);
}
#endif
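// Usage sketch (illustrative comment only; assumes an MPI_CLASS instance "comm" whose
// public maxReduce() wrapper forwards to the call_maxReduce specializations above):
//     double v[2] = {local_a, local_b};   // hypothetical per-rank values
//     int rank_of_max[2];
//     comm.maxReduce(v, 2, rank_of_max);  // v[] now holds the global maxima and
//                                         // rank_of_max[] the owning ranks (MPI_MAXLOC)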
/************************************************************************
* bcast *
* Note: these specializations are only called when using MPI. *
************************************************************************/
#ifdef USE_MPI
// char
template <>
void MPI_CLASS::call_bcast<unsigned char>(unsigned char *x, int n,
int root) const {
PROFILE_START("bcast<unsigned char>", profile_level);
MPI_Bcast(x, n, MPI_UNSIGNED_CHAR, root, communicator);
PROFILE_STOP("bcast<unsigned char>", profile_level);
}
template <> void MPI_CLASS::call_bcast<char>(char *x, int n, int root) const {
PROFILE_START("bcast<char>", profile_level);
MPI_Bcast(x, n, MPI_CHAR, root, communicator);
PROFILE_STOP("bcast<char>", profile_level);
}
// int
template <>
void MPI_CLASS::call_bcast<unsigned int>(unsigned int *x, int n,
int root) const {
PROFILE_START("bcast<unsigned int>", profile_level);
MPI_Bcast(x, n, MPI_UNSIGNED, root, communicator);
PROFILE_STOP("bcast<unsigned int>", profile_level);
}
template <> void MPI_CLASS::call_bcast<int>(int *x, int n, int root) const {
PROFILE_START("bcast<int>", profile_level);
MPI_Bcast(x, n, MPI_INT, root, communicator);
PROFILE_STOP("bcast<int>", profile_level);
}
// float
template <> void MPI_CLASS::call_bcast<float>(float *x, int n, int root) const {
PROFILE_START("bcast<float>", profile_level);
MPI_Bcast(x, n, MPI_FLOAT, root, communicator);
PROFILE_STOP("bcast<float>", profile_level);
}
// double
template <>
void MPI_CLASS::call_bcast<double>(double *x, int n, int root) const {
PROFILE_START("bcast<double>", profile_level);
MPI_Bcast(x, n, MPI_DOUBLE, root, communicator);
PROFILE_STOP("bcast<double>", profile_level);
}
#else
// We need a concrete instantiation of bcast<char>(x,n,root);
template <> void MPI_CLASS::call_bcast<char>(char *, int, int) const {}
#endif
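// Usage sketch (illustrative only; assumes an MPI_CLASS instance "comm" whose public
// bcast() wrapper forwards to call_bcast above):
//     int n = 0;              // rank 0 fills in the real value before the call
//     comm.bcast(&n, 1, 0);   // afterwards every rank holds rank 0's value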
/************************************************************************
* Perform a global barrier across all processors. *
************************************************************************/
void MPI_CLASS::barrier() const {
#ifdef USE_MPI
MPI_Barrier(communicator);
#endif
}
/************************************************************************
* Send data array to another processor. *
* Note: these specializations are only called when using MPI. *
************************************************************************/
#ifdef USE_MPI
// char
template <>
void MPI_CLASS::send<char>(const char *buf, int length, int recv_proc_number,
int tag) const {
// Set the tag to 0 if it is < 0
tag = (tag >= 0) ? tag : 0;
MPI_INSIST(tag <= d_maxTag, "Maximum tag value exceeded");
// Send the data
PROFILE_START("send<char>", profile_level);
MPI_Send((void *)buf, length, MPI_CHAR, recv_proc_number, tag,
communicator);
PROFILE_STOP("send<char>", profile_level);
}
// int
template <>
void MPI_CLASS::send<int>(const int *buf, int length, int recv_proc_number,
int tag) const {
// Set the tag to 0 if it is < 0
tag = (tag >= 0) ? tag : 0;
MPI_INSIST(tag <= d_maxTag, "Maximum tag value exceeded");
// Send the data
PROFILE_START("send<int>", profile_level);
MPI_Send((void *)buf, length, MPI_INT, recv_proc_number, tag, communicator);
PROFILE_STOP("send<int>", profile_level);
}
// float
template <>
void MPI_CLASS::send<float>(const float *buf, int length, int recv_proc_number,
int tag) const {
// Set the tag to 0 if it is < 0
tag = (tag >= 0) ? tag : 0;
MPI_INSIST(tag <= d_maxTag, "Maximum tag value exceeded");
// Send the data
PROFILE_START("send<float>", profile_level);
MPI_Send((void *)buf, length, MPI_FLOAT, recv_proc_number, tag,
communicator);
PROFILE_STOP("send<float>", profile_level);
}
// double
template <>
void MPI_CLASS::send<double>(const double *buf, int length,
int recv_proc_number, int tag) const {
// Set the tag to 0 if it is < 0
tag = (tag >= 0) ? tag : 0;
MPI_INSIST(tag <= d_maxTag, "Maximum tag value exceeded");
// Send the data
PROFILE_START("send<double>", profile_level);
MPI_Send((void *)buf, length, MPI_DOUBLE, recv_proc_number, tag,
communicator);
PROFILE_STOP("send<double>", profile_level);
}
#else
// We need a concrete instantiation of send for use without MPI
template <>
void MPI_CLASS::send<char>(const char *buf, int length, int, int tag) const {
MPI_INSIST(tag <= d_maxTag, "Maximum tag value exceeded");
MPI_INSIST(tag >= 0, "tag must be >= 0");
PROFILE_START("send<char>", profile_level);
auto id = getRequest(communicator, tag);
auto it = global_isendrecv_list.find(id);
    MPI_INSIST(it != global_isendrecv_list.end(),
               "send must be paired with a previous call to irecv in serial");
    MPI_ASSERT(it->second.status == 2);
    memcpy((char *)it->second.data, buf, length);
    global_isendrecv_list.erase(it);
    PROFILE_STOP("send<char>", profile_level);
}
#endif
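// Usage sketch (illustrative only; "comm", the buffer and the peer rank are
// hypothetical). The blocking specializations above are used as:
//     double buf[8] = {0};
//     comm.send<double>(buf, 8, /*recv_proc_number=*/1, /*tag=*/0);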
/************************************************************************
* Non-blocking send data array to another processor. *
* Note: these specializations are only called when using MPI. *
************************************************************************/
#ifdef USE_MPI
// char
template <>
MPI_Request MPI_CLASS::Isend<char>(const char *buf, int length, int recv_proc,
int tag) const {
MPI_INSIST(tag <= d_maxTag, "Maximum tag value exceeded");
MPI_INSIST(tag >= 0, "tag must be >= 0");
MPI_Request request;
PROFILE_START("Isend<char>", profile_level);
MPI_Isend((void *)buf, length, MPI_CHAR, recv_proc, tag, communicator,
&request);
PROFILE_STOP("Isend<char>", profile_level);
return request;
}
// int
template <>
MPI_Request MPI_CLASS::Isend<int>(const int *buf, int length, int recv_proc,
int tag) const {
MPI_INSIST(tag <= d_maxTag, "Maximum tag value exceeded");
MPI_INSIST(tag >= 0, "tag must be >= 0");
MPI_Request request;
PROFILE_START("Isend<int>", profile_level);
MPI_Isend((void *)buf, length, MPI_INT, recv_proc, tag, communicator,
&request);
PROFILE_STOP("Isend<int>", profile_level);
return request;
}
// float
template <>
MPI_Request MPI_CLASS::Isend<float>(const float *buf, int length, int recv_proc,
int tag) const {
MPI_INSIST(tag <= d_maxTag, "Maximum tag value exceeded");
MPI_INSIST(tag >= 0, "tag must be >= 0");
MPI_Request request;
PROFILE_START("Isend<float>", profile_level);
MPI_Isend((void *)buf, length, MPI_FLOAT, recv_proc, tag, communicator,
&request);
PROFILE_STOP("Isend<float>", profile_level);
return request;
}
// double
template <>
MPI_Request MPI_CLASS::Isend<double>(const double *buf, int length,
int recv_proc, int tag) const {
MPI_INSIST(tag <= d_maxTag, "Maximum tag value exceeded");
MPI_INSIST(tag >= 0, "tag must be >= 0");
MPI_Request request;
PROFILE_START("Isend<double>", profile_level);
MPI_Isend((void *)buf, length, MPI_DOUBLE, recv_proc, tag, communicator,
&request);
PROFILE_STOP("Isend<double>", profile_level);
return request;
}
#else
// We need a concrete instantiation of Isend for use without MPI
template <>
MPI_Request MPI_CLASS::Isend<char>(const char *buf, int length, int,
int tag) const {
MPI_INSIST(tag <= d_maxTag, "Maximum tag value exceeded");
MPI_INSIST(tag >= 0, "tag must be >= 0");
PROFILE_START("Isend<char>", profile_level);
auto id = getRequest(communicator, tag);
auto it = global_isendrecv_list.find(id);
if (it == global_isendrecv_list.end()) {
// We are calling isend first
Isendrecv_struct data;
data.data = buf;
data.status = 1;
global_isendrecv_list.insert(
std::pair<MPI_Request, Isendrecv_struct>(id, data));
} else {
// We called irecv first
MPI_ASSERT(it->second.status == 2);
memcpy((char *)it->second.data, buf, length);
global_isendrecv_list.erase(it);
}
PROFILE_STOP("Isend<char>", profile_level);
return id;
}
#endif
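// Usage sketch (illustrative only; "comm", "data", "n" and the peer rank are
// hypothetical). The request returned by Isend must eventually be completed:
//     MPI_Request req = comm.Isend<int>(data, n, /*recv_proc=*/1, /*tag=*/3);
//     // ... overlap computation with the communication ...
//     comm.wait(req);   // wait() is implemented further below in this file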
/************************************************************************
* Send byte array to another processor. *
************************************************************************/
void MPI_CLASS::sendBytes(const void *buf, int number_bytes,
int recv_proc_number, int tag) const {
MPI_INSIST(tag <= d_maxTag, "Maximum tag value exceeded");
MPI_INSIST(tag >= 0, "tag must be >= 0");
send<char>((const char *)buf, number_bytes, recv_proc_number, tag);
}
/************************************************************************
* Non-blocking send byte array to another processor. *
************************************************************************/
MPI_Request MPI_CLASS::IsendBytes(const void *buf, int number_bytes,
const int recv_proc, const int tag) const {
MPI_INSIST(tag <= d_maxTag, "Maximum tag value exceeded");
MPI_INSIST(tag >= 0, "tag must be >= 0");
return Isend<char>((const char *)buf, number_bytes, recv_proc, tag);
}
/************************************************************************
 * Receive data array from another processor.                           *
* Note: these specializations are only called when using MPI. *
************************************************************************/
#ifdef USE_MPI
// char
template <>
void MPI_CLASS::recv<char>(char *buf, int &length, int send_proc_number,
const bool get_length, int tag) const {
// Set the tag to 0 if it is < 0
tag = (tag >= 0) ? tag : 0;
MPI_INSIST(tag <= d_maxTag, "Maximum tag value exceeded");
PROFILE_START("recv<char>", profile_level);
    // Get the receive length if necessary
if (get_length) {
int bytes = this->probe(send_proc_number, tag);
int recv_length = bytes / sizeof(char);
MPI_INSIST(length >= recv_length,
"Recived length is larger than allocated array");
length = recv_length;
}
    // Receive the data
MPI_Status status;
MPI_Recv((void *)buf, length, MPI_CHAR, send_proc_number, tag, communicator,
&status);
PROFILE_STOP("recv<char>", profile_level);
}
// int
template <>
void MPI_CLASS::recv<int>(int *buf, int &length, int send_proc_number,
const bool get_length, int tag) const {
// Set the tag to 0 if it is < 0
tag = (tag >= 0) ? tag : 0;
MPI_INSIST(tag <= d_maxTag, "Maximum tag value exceeded");
PROFILE_START("recv<int>", profile_level);
    // Get the receive length if necessary
if (get_length) {
int bytes = this->probe(send_proc_number, tag);
int recv_length = bytes / sizeof(int);
MPI_INSIST(length >= recv_length,
"Recived length is larger than allocated array");
length = recv_length;
}
    // Receive the data
MPI_Status status;
MPI_Recv((void *)buf, length, MPI_INT, send_proc_number, tag, communicator,
&status);
PROFILE_STOP("recv<int>", profile_level);
}
// float
template <>
void MPI_CLASS::recv<float>(float *buf, int &length, int send_proc_number,
const bool get_length, int tag) const {
// Set the tag to 0 if it is < 0
tag = (tag >= 0) ? tag : 0;
MPI_INSIST(tag <= d_maxTag, "Maximum tag value exceeded");
PROFILE_START("recv<float>", profile_level);
    // Get the receive length if necessary
if (get_length) {
int bytes = this->probe(send_proc_number, tag);
int recv_length = bytes / sizeof(float);
MPI_INSIST(length >= recv_length,
"Recived length is larger than allocated array");
length = recv_length;
}
    // Receive the data
MPI_Status status;
MPI_Recv((void *)buf, length, MPI_FLOAT, send_proc_number, tag,
communicator, &status);
PROFILE_STOP("recv<float>", profile_level);
}
// double
template <>
void MPI_CLASS::recv<double>(double *buf, int &length, int send_proc_number,
const bool get_length, int tag) const {
// Set the tag to 0 if it is < 0
tag = (tag >= 0) ? tag : 0;
MPI_INSIST(tag <= d_maxTag, "Maximum tag value exceeded");
PROFILE_START("recv<double>", profile_level);
    // Get the receive length if necessary
if (get_length) {
int bytes = this->probe(send_proc_number, tag);
int recv_length = bytes / sizeof(double);
MPI_INSIST(length >= recv_length,
"Recived length is larger than allocated array");
length = recv_length;
}
    // Receive the data
MPI_Status status;
MPI_Recv((void *)buf, length, MPI_DOUBLE, send_proc_number, tag,
communicator, &status);
PROFILE_STOP("recv<double>", profile_level);
}
#else
// We need a concrete instantiation of recv for use without mpi
template <>
void MPI_CLASS::recv<char>(char *buf, int &length, int, const bool,
int tag) const {
MPI_INSIST(tag <= d_maxTag, "Maximum tag value exceeded");
MPI_INSIST(tag >= 0, "tag must be >= 0");
PROFILE_START("recv<char>", profile_level);
auto id = getRequest(communicator, tag);
auto it = global_isendrecv_list.find(id);
MPI_INSIST(it != global_isendrecv_list.end(),
"recv must be paired with a previous call to isend in serial");
MPI_ASSERT(it->second.status == 1);
memcpy(buf, it->second.data, length);
global_isendrecv_list.erase(it);
PROFILE_STOP("recv<char>", profile_level);
}
#endif
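// Usage sketch (illustrative only; "comm" and the source rank are hypothetical).
// With get_length == true the probe above shrinks "length" to the incoming size:
//     double buf[1024];
//     int length = 1024;   // capacity of buf on entry
//     comm.recv<double>(buf, length, /*send_proc_number=*/0, /*get_length=*/true,
//                       /*tag=*/0);
//     // length now holds the number of doubles actually received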
/************************************************************************
 * Non-blocking receive data array from another processor.             *
* Note: these specializations are only called when using MPI. *
************************************************************************/
#ifdef USE_MPI
// char
template <>
MPI_Request MPI_CLASS::Irecv<char>(char *buf, int length, int send_proc,
int tag) const {
MPI_INSIST(tag <= d_maxTag, "Maximum tag value exceeded");
MPI_INSIST(tag >= 0, "tag must be >= 0");
MPI_Request request;
PROFILE_START("Irecv<char>", profile_level);
MPI_Irecv((void *)buf, length, MPI_CHAR, send_proc, tag, communicator,
&request);
PROFILE_STOP("Irecv<char>", profile_level);
return request;
}
// int
template <>
MPI_Request MPI_CLASS::Irecv<int>(int *buf, int length, int send_proc,
int tag) const {
MPI_INSIST(tag <= d_maxTag, "Maximum tag value exceeded");
MPI_INSIST(tag >= 0, "tag must be >= 0");
MPI_Request request;
PROFILE_START("Irecv<int>", profile_level);
MPI_Irecv((void *)buf, length, MPI_INT, send_proc, tag, communicator,
&request);
PROFILE_STOP("Irecv<int>", profile_level);
return request;
}
// float
template <>
MPI_Request MPI_CLASS::Irecv<float>(float *buf, int length, int send_proc,
int tag) const {
MPI_INSIST(tag <= d_maxTag, "Maximum tag value exceeded");
MPI_INSIST(tag >= 0, "tag must be >= 0");
MPI_Request request;
PROFILE_START("Irecv<float>", profile_level);
MPI_Irecv((void *)buf, length, MPI_FLOAT, send_proc, tag, communicator,
&request);
PROFILE_STOP("Irecv<float>", profile_level);
return request;
}
// double
template <>
MPI_Request MPI_CLASS::Irecv<double>(double *buf, int length, int send_proc,
int tag) const {
MPI_INSIST(tag <= d_maxTag, "Maximum tag value exceeded");
MPI_INSIST(tag >= 0, "tag must be >= 0");
MPI_Request request;
PROFILE_START("Irecv<double>", profile_level);
MPI_Irecv((void *)buf, length, MPI_DOUBLE, send_proc, tag, communicator,
&request);
PROFILE_STOP("Irecv<double>", profile_level);
return request;
}
#else
// We need a concrete instantiation of Irecv for use without MPI
template <>
MPI_Request MPI_CLASS::Irecv<char>(char *buf, int length, int, int tag) const {
MPI_INSIST(tag <= d_maxTag, "Maximum tag value exceeded");
MPI_INSIST(tag >= 0, "tag must be >= 0");
PROFILE_START("Irecv<char>", profile_level);
auto id = getRequest(communicator, tag);
auto it = global_isendrecv_list.find(id);
if (it == global_isendrecv_list.end()) {
// We are calling Irecv first
Isendrecv_struct data;
data.data = buf;
data.status = 2;
global_isendrecv_list.insert(
std::pair<MPI_Request, Isendrecv_struct>(id, data));
} else {
// We called Isend first
MPI_ASSERT(it->second.status == 1);
memcpy(buf, it->second.data, length);
global_isendrecv_list.erase(it);
}
PROFILE_STOP("Irecv<char>", profile_level);
return id;
}
#endif
/************************************************************************
 * Receive byte array from another processor.                          *
************************************************************************/
void MPI_CLASS::recvBytes(void *buf, int &number_bytes, int send_proc,
int tag) const {
recv<char>((char *)buf, number_bytes, send_proc, false, tag);
}
/************************************************************************
 * Non-blocking receive byte array from another processor.             *
************************************************************************/
MPI_Request MPI_CLASS::IrecvBytes(void *buf, int number_bytes, int send_proc,
int tag) const {
MPI_INSIST(tag <= d_maxTag, "Maximum tag value exceeded");
MPI_INSIST(tag >= 0, "tag must be >= 0");
return Irecv<char>((char *)buf, number_bytes, send_proc, tag);
}
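// Usage sketch (illustrative only; "comm", "rbuf", "sbuf", "nbytes" and the peer
// ranks are hypothetical). A non-blocking byte exchange with the helpers above:
//     MPI_Request reqs[2];
//     reqs[0] = comm.IrecvBytes(rbuf, nbytes, /*send_proc=*/left, /*tag=*/7);
//     reqs[1] = comm.IsendBytes(sbuf, nbytes, /*recv_proc=*/right, /*tag=*/7);
//     comm.waitAll(2, reqs);   // waitAll() is implemented further below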
/************************************************************************
* sendrecv *
************************************************************************/
#if defined(USE_MPI)
template <>
void MPI_CLASS::sendrecv<char>(const char *sendbuf, int sendcount, int dest,
int sendtag, char *recvbuf, int recvcount,
int source, int recvtag) const {
PROFILE_START("sendrecv<char>", profile_level);
MPI_Sendrecv(sendbuf, sendcount, MPI_CHAR, dest, sendtag, recvbuf,
recvcount, MPI_CHAR, source, recvtag, communicator,
MPI_STATUS_IGNORE);
PROFILE_STOP("sendrecv<char>", profile_level);
}
template <>
void MPI_CLASS::sendrecv<int>(const int *sendbuf, int sendcount, int dest,
int sendtag, int *recvbuf, int recvcount,
int source, int recvtag) const {
PROFILE_START("sendrecv<int>", profile_level);
MPI_Sendrecv(sendbuf, sendcount, MPI_INT, dest, sendtag, recvbuf, recvcount,
MPI_INT, source, recvtag, communicator, MPI_STATUS_IGNORE);
PROFILE_STOP("sendrecv<int>", profile_level);
}
template <>
void MPI_CLASS::sendrecv<float>(const float *sendbuf, int sendcount, int dest,
int sendtag, float *recvbuf, int recvcount,
int source, int recvtag) const {
PROFILE_START("sendrecv<float>", profile_level);
MPI_Sendrecv(sendbuf, sendcount, MPI_FLOAT, dest, sendtag, recvbuf,
recvcount, MPI_FLOAT, source, recvtag, communicator,
MPI_STATUS_IGNORE);
PROFILE_STOP("sendrecv<float>", profile_level);
}
template <>
void MPI_CLASS::sendrecv<double>(const double *sendbuf, int sendcount, int dest,
int sendtag, double *recvbuf, int recvcount,
int source, int recvtag) const {
PROFILE_START("sendrecv<double>", profile_level);
MPI_Sendrecv(sendbuf, sendcount, MPI_DOUBLE, dest, sendtag, recvbuf,
recvcount, MPI_DOUBLE, source, recvtag, communicator,
MPI_STATUS_IGNORE);
PROFILE_STOP("sendrecv<double>", profile_level);
}
#endif
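// Usage sketch (illustrative only; "comm", "rank", "size" and the buffers are
// hypothetical). A ring shift built on the sendrecv specializations above:
//     int dest = (rank + 1) % size, source = (rank + size - 1) % size;
//     comm.sendrecv<double>(sendbuf, n, dest, /*sendtag=*/0,
//                           recvbuf, n, source, /*recvtag=*/0);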
/************************************************************************
* allGather *
* Note: these specializations are only called when using MPI. *
************************************************************************/
#ifdef USE_MPI
// unsigned char
template <>
void MPI_CLASS::call_allGather<unsigned char>(const unsigned char &x_in,
unsigned char *x_out) const {
PROFILE_START("allGather<unsigned char>", profile_level);
MPI_Allgather((void *)&x_in, 1, MPI_UNSIGNED_CHAR, (void *)x_out, 1,
MPI_UNSIGNED_CHAR, communicator);
PROFILE_STOP("allGather<unsigned char>", profile_level);
}
template <>
void MPI_CLASS::call_allGather<unsigned char>(const unsigned char *x_in,
int size_in, unsigned char *x_out,
int *size_out,
int *disp_out) const {
PROFILE_START("allGatherv<unsigned char>", profile_level);
    MPI_Allgatherv((void *)x_in, size_in, MPI_UNSIGNED_CHAR, (void *)x_out,
                   size_out, disp_out, MPI_UNSIGNED_CHAR, communicator);
PROFILE_STOP("allGatherv<unsigned char>", profile_level);
}
// char
template <>
void MPI_CLASS::call_allGather<char>(const char &x_in, char *x_out) const {
PROFILE_START("allGather<char>", profile_level);
MPI_Allgather((void *)&x_in, 1, MPI_CHAR, (void *)x_out, 1, MPI_CHAR,
communicator);
PROFILE_STOP("allGather<char>", profile_level);
}
template <>
void MPI_CLASS::call_allGather<char>(const char *x_in, int size_in, char *x_out,
int *size_out, int *disp_out) const {
PROFILE_START("allGatherv<char>", profile_level);
MPI_Allgatherv((void *)x_in, size_in, MPI_CHAR, (void *)x_out, size_out,
disp_out, MPI_CHAR, communicator);
PROFILE_STOP("allGatherv<char>", profile_level);
}
// unsigned int
template <>
void MPI_CLASS::call_allGather<unsigned int>(const unsigned int &x_in,
unsigned int *x_out) const {
PROFILE_START("allGather<unsigned int>", profile_level);
MPI_Allgather((void *)&x_in, 1, MPI_UNSIGNED, (void *)x_out, 1,
MPI_UNSIGNED, communicator);
PROFILE_STOP("allGather<unsigned int>", profile_level);
}
template <>
void MPI_CLASS::call_allGather<unsigned int>(const unsigned int *x_in,
int size_in, unsigned int *x_out,
int *size_out,
int *disp_out) const {
PROFILE_START("allGatherv<unsigned int>", profile_level);
MPI_Allgatherv((void *)x_in, size_in, MPI_UNSIGNED, (void *)x_out, size_out,
disp_out, MPI_UNSIGNED, communicator);
PROFILE_STOP("allGatherv<unsigned int>", profile_level);
}
// int
template <>
void MPI_CLASS::call_allGather<int>(const int &x_in, int *x_out) const {
PROFILE_START("allGather<int>", profile_level);
MPI_Allgather((void *)&x_in, 1, MPI_INT, (void *)x_out, 1, MPI_INT,
communicator);
PROFILE_STOP("allGather<int>", profile_level);
}
template <>
void MPI_CLASS::call_allGather<int>(const int *x_in, int size_in, int *x_out,
int *size_out, int *disp_out) const {
PROFILE_START("allGatherv<int>", profile_level);
MPI_Allgatherv((void *)x_in, size_in, MPI_INT, (void *)x_out, size_out,
disp_out, MPI_INT, communicator);
PROFILE_STOP("allGatherv<int>", profile_level);
}
// unsigned long int
template <>
void MPI_CLASS::call_allGather<unsigned long int>(
const unsigned long int &x_in, unsigned long int *x_out) const {
PROFILE_START("allGather<unsigned long>", profile_level);
MPI_Allgather((void *)&x_in, 1, MPI_UNSIGNED_LONG, (void *)x_out, 1,
MPI_UNSIGNED_LONG, communicator);
PROFILE_STOP("allGather<unsigned long>", profile_level);
}
template <>
void MPI_CLASS::call_allGather<unsigned long int>(const unsigned long int *x_in,
int size_in,
unsigned long int *x_out,
int *size_out,
int *disp_out) const {
PROFILE_START("allGatherv<unsigned long>", profile_level);
MPI_Allgatherv((void *)x_in, size_in, MPI_UNSIGNED_LONG, (void *)x_out,
size_out, disp_out, MPI_UNSIGNED_LONG, communicator);
PROFILE_STOP("allGatherv<unsigned long>", profile_level);
}
// long int
template <>
void MPI_CLASS::call_allGather<long int>(const long int &x_in,
long int *x_out) const {
PROFILE_START("allGather<long int>", profile_level);
MPI_Allgather((void *)&x_in, 1, MPI_LONG, (void *)x_out, 1, MPI_LONG,
communicator);
PROFILE_STOP("allGather<long int>", profile_level);
}
template <>
void MPI_CLASS::call_allGather<long int>(const long int *x_in, int size_in,
long int *x_out, int *size_out,
int *disp_out) const {
PROFILE_START("allGatherv<long int>", profile_level);
MPI_Allgatherv((void *)x_in, size_in, MPI_LONG, (void *)x_out, size_out,
disp_out, MPI_LONG, communicator);
PROFILE_STOP("allGatherv<long int>", profile_level);
}
// float
template <>
void MPI_CLASS::call_allGather<float>(const float &x_in, float *x_out) const {
PROFILE_START("allGather<float>", profile_level);
MPI_Allgather((void *)&x_in, 1, MPI_FLOAT, (void *)x_out, 1, MPI_FLOAT,
communicator);
PROFILE_STOP("allGather<float>", profile_level);
}
template <>
void MPI_CLASS::call_allGather<float>(const float *x_in, int size_in,
float *x_out, int *size_out,
int *disp_out) const {
PROFILE_START("allGatherv<float>", profile_level);
MPI_Allgatherv((void *)x_in, size_in, MPI_FLOAT, (void *)x_out, size_out,
disp_out, MPI_FLOAT, communicator);
PROFILE_STOP("allGatherv<float>", profile_level);
}
// double
template <>
void MPI_CLASS::call_allGather<double>(const double &x_in,
double *x_out) const {
PROFILE_START("allGather<double>", profile_level);
MPI_Allgather((void *)&x_in, 1, MPI_DOUBLE, (void *)x_out, 1, MPI_DOUBLE,
communicator);
PROFILE_STOP("allGather<double>", profile_level);
}
template <>
void MPI_CLASS::call_allGather<double>(const double *x_in, int size_in,
double *x_out, int *size_out,
int *disp_out) const {
PROFILE_START("allGatherv<double>", profile_level);
MPI_Allgatherv((void *)x_in, size_in, MPI_DOUBLE, (void *)x_out, size_out,
disp_out, MPI_DOUBLE, communicator);
PROFILE_STOP("allGatherv<double>", profile_level);
}
#else
// We need a concrete instantiation of call_allGather<char>(x_in,size_in,x_out,size_out)
template <>
void MPI_CLASS::call_allGather<char>(const char *, int, char *, int *,
int *) const {
MPI_ERROR("Internal error in communicator (allGather) ");
}
#endif
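// Usage sketch (illustrative only; assumes an MPI_CLASS instance "comm" whose public
// allGather() wrappers forward to call_allGather above; "nprocs" is the assumed
// communicator size):
//     int local = 0;                       // hypothetical per-rank value
//     std::vector<int> all(nprocs);
//     comm.allGather(local, all.data());   // all[r] == value contributed by rank r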
/************************************************************************
* allToAll *
* Note: these specializations are only called when using MPI. *
************************************************************************/
#ifdef USE_MPI
template <>
void MPI_CLASS::allToAll<unsigned char>(int n, const unsigned char *send,
unsigned char *recv) const {
PROFILE_START("allToAll<unsigned char>", profile_level);
MPI_Alltoall((void *)send, n, MPI_UNSIGNED_CHAR, (void *)recv, n,
MPI_UNSIGNED_CHAR, communicator);
PROFILE_STOP("allToAll<unsigned char>", profile_level);
}
template <>
void MPI_CLASS::allToAll<char>(int n, const char *send, char *recv) const {
PROFILE_START("allToAll<char>", profile_level);
MPI_Alltoall((void *)send, n, MPI_CHAR, (void *)recv, n, MPI_CHAR,
communicator);
PROFILE_STOP("allToAll<char>", profile_level);
}
template <>
void MPI_CLASS::allToAll<unsigned int>(int n, const unsigned int *send,
unsigned int *recv) const {
PROFILE_START("allToAll<unsigned int>", profile_level);
MPI_Alltoall((void *)send, n, MPI_UNSIGNED, (void *)recv, n, MPI_UNSIGNED,
communicator);
PROFILE_STOP("allToAll<unsigned int>", profile_level);
}
template <>
void MPI_CLASS::allToAll<int>(int n, const int *send, int *recv) const {
PROFILE_START("allToAll<int>", profile_level);
MPI_Alltoall((void *)send, n, MPI_INT, (void *)recv, n, MPI_INT,
communicator);
PROFILE_STOP("allToAll<int>", profile_level);
}
template <>
void MPI_CLASS::allToAll<unsigned long int>(int n,
const unsigned long int *send,
unsigned long int *recv) const {
PROFILE_START("allToAll<unsigned long>", profile_level);
MPI_Alltoall((void *)send, n, MPI_UNSIGNED_LONG, (void *)recv, n,
MPI_UNSIGNED_LONG, communicator);
PROFILE_STOP("allToAll<unsigned long>", profile_level);
}
template <>
void MPI_CLASS::allToAll<long int>(int n, const long int *send,
long int *recv) const {
PROFILE_START("allToAll<long int>", profile_level);
MPI_Alltoall((void *)send, n, MPI_LONG, (void *)recv, n, MPI_LONG,
communicator);
PROFILE_STOP("allToAll<long int>", profile_level);
}
template <>
void MPI_CLASS::allToAll<float>(int n, const float *send, float *recv) const {
PROFILE_START("allToAll<float>", profile_level);
MPI_Alltoall((void *)send, n, MPI_FLOAT, (void *)recv, n, MPI_FLOAT,
communicator);
PROFILE_STOP("allToAll<float>", profile_level);
}
template <>
void MPI_CLASS::allToAll<double>(int n, const double *send,
double *recv) const {
PROFILE_START("allToAll<double>", profile_level);
MPI_Alltoall((void *)send, n, MPI_DOUBLE, (void *)recv, n, MPI_DOUBLE,
communicator);
PROFILE_STOP("allToAll<double>", profile_level);
}
#endif
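// Usage sketch (illustrative only; "comm", "n" and "nprocs" are hypothetical).
// Each rank sends n entries to every rank and receives n entries from each:
//     std::vector<int> sendbuf(n * nprocs), recvbuf(n * nprocs);
//     comm.allToAll<int>(n, sendbuf.data(), recvbuf.data());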
/************************************************************************
* call_allToAll *
* Note: these specializations are only called when using MPI. *
************************************************************************/
#ifdef USE_MPI
// unsigned char
template <>
void MPI_CLASS::call_allToAll<unsigned char>(
const unsigned char *send_data, const int send_cnt[], const int send_disp[],
unsigned char *recv_data, const int *recv_cnt, const int *recv_disp) const {
PROFILE_START("allToAllv<unsigned char>", profile_level);
MPI_Alltoallv((void *)send_data, (int *)send_cnt, (int *)send_disp,
MPI_UNSIGNED_CHAR, (void *)recv_data, (int *)recv_cnt,
(int *)recv_disp, MPI_UNSIGNED_CHAR, communicator);
PROFILE_STOP("allToAllv<unsigned char>", profile_level);
}
// char
template <>
void MPI_CLASS::call_allToAll<char>(const char *send_data, const int send_cnt[],
const int send_disp[], char *recv_data,
const int *recv_cnt,
const int *recv_disp) const {
PROFILE_START("allToAllv<char>", profile_level);
MPI_Alltoallv((void *)send_data, (int *)send_cnt, (int *)send_disp,
MPI_CHAR, (void *)recv_data, (int *)recv_cnt,
(int *)recv_disp, MPI_CHAR, communicator);
PROFILE_STOP("allToAllv<char>", profile_level);
}
// unsigned int
template <>
void MPI_CLASS::call_allToAll<unsigned int>(
const unsigned int *send_data, const int send_cnt[], const int send_disp[],
unsigned int *recv_data, const int *recv_cnt, const int *recv_disp) const {
PROFILE_START("allToAllv<unsigned int>", profile_level);
MPI_Alltoallv((void *)send_data, (int *)send_cnt, (int *)send_disp,
MPI_UNSIGNED, (void *)recv_data, (int *)recv_cnt,
(int *)recv_disp, MPI_UNSIGNED, communicator);
PROFILE_STOP("allToAllv<unsigned int>", profile_level);
}
// int
template <>
void MPI_CLASS::call_allToAll<int>(const int *send_data, const int send_cnt[],
const int send_disp[], int *recv_data,
const int *recv_cnt,
const int *recv_disp) const {
PROFILE_START("allToAllv<int>", profile_level);
MPI_Alltoallv((void *)send_data, (int *)send_cnt, (int *)send_disp, MPI_INT,
(void *)recv_data, (int *)recv_cnt, (int *)recv_disp, MPI_INT,
communicator);
PROFILE_STOP("allToAllv<int>", profile_level);
}
// unsigned long int
template <>
void MPI_CLASS::call_allToAll<unsigned long int>(
const unsigned long int *send_data, const int send_cnt[],
const int send_disp[], unsigned long int *recv_data, const int *recv_cnt,
const int *recv_disp) const {
PROFILE_START("allToAllv<unsigned long>", profile_level);
MPI_Alltoallv((void *)send_data, (int *)send_cnt, (int *)send_disp,
MPI_UNSIGNED_LONG, (void *)recv_data, (int *)recv_cnt,
(int *)recv_disp, MPI_UNSIGNED_LONG, communicator);
PROFILE_STOP("allToAllv<unsigned long>", profile_level);
}
// long int
template <>
void MPI_CLASS::call_allToAll<long int>(
const long int *send_data, const int send_cnt[], const int send_disp[],
long int *recv_data, const int *recv_cnt, const int *recv_disp) const {
PROFILE_START("allToAllv<long int>", profile_level);
MPI_Alltoallv((void *)send_data, (int *)send_cnt, (int *)send_disp,
MPI_LONG, (void *)recv_data, (int *)recv_cnt,
(int *)recv_disp, MPI_LONG, communicator);
PROFILE_STOP("allToAllv<long int>", profile_level);
}
// float
template <>
void MPI_CLASS::call_allToAll<float>(const float *send_data,
const int send_cnt[],
const int send_disp[], float *recv_data,
const int *recv_cnt,
const int *recv_disp) const {
PROFILE_START("allToAllv<float>", profile_level);
MPI_Alltoallv((void *)send_data, (int *)send_cnt, (int *)send_disp,
MPI_FLOAT, (void *)recv_data, (int *)recv_cnt,
(int *)recv_disp, MPI_FLOAT, communicator);
PROFILE_STOP("allToAllv<float>", profile_level);
}
// double
template <>
void MPI_CLASS::call_allToAll<double>(const double *send_data,
const int send_cnt[],
const int send_disp[], double *recv_data,
const int *recv_cnt,
const int *recv_disp) const {
PROFILE_START("allToAllv<double>", profile_level);
MPI_Alltoallv((void *)send_data, (int *)send_cnt, (int *)send_disp,
MPI_DOUBLE, (void *)recv_data, (int *)recv_cnt,
(int *)recv_disp, MPI_DOUBLE, communicator);
PROFILE_STOP("allToAllv<double>", profile_level);
}
#else
// Default instantiation of char
template <>
void MPI_CLASS::call_allToAll<char>(const char *, const int[], const int[],
char *, const int *, const int *) const {
MPI_ERROR("Should not reach this point");
}
#endif
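// Sketch of the count/displacement bookkeeping the variable-size exchanges above
// expect (illustrative only; "send_cnt", "recv_cnt" and "nprocs" are hypothetical
// per-rank element counts and the communicator size):
//     std::vector<int> send_disp(nprocs, 0), recv_disp(nprocs, 0);
//     for (int r = 1; r < nprocs; r++) {
//         send_disp[r] = send_disp[r - 1] + send_cnt[r - 1];
//         recv_disp[r] = recv_disp[r - 1] + recv_cnt[r - 1];
//     }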
/************************************************************************
* call_sumScan *
* Note: these specializations are only called when using MPI. *
************************************************************************/
#ifdef USE_MPI
// unsigned char
template <>
void MPI_CLASS::call_sumScan<unsigned char>(const unsigned char *send,
unsigned char *recv, int n) const {
PROFILE_START("sumScan<unsigned char>", profile_level);
MPI_Scan((void *)send, (void *)recv, n, MPI_UNSIGNED_CHAR, MPI_SUM,
communicator);
PROFILE_STOP("sumScan<unsigned char>", profile_level);
}
// char
template <>
void MPI_CLASS::call_sumScan<char>(const char *send, char *recv, int n) const {
PROFILE_START("sumScan<char>", profile_level);
MPI_Scan((void *)send, (void *)recv, n, MPI_SIGNED_CHAR, MPI_SUM,
communicator);
PROFILE_STOP("sumScan<char>", profile_level);
}
// unsigned int
template <>
void MPI_CLASS::call_sumScan<unsigned int>(const unsigned int *send,
unsigned int *recv, int n) const {
PROFILE_START("sumScan<unsigned int>", profile_level);
MPI_Scan((void *)send, (void *)recv, n, MPI_UNSIGNED, MPI_SUM,
communicator);
PROFILE_STOP("sumScan<unsigned int>", profile_level);
}
// int
template <>
void MPI_CLASS::call_sumScan<int>(const int *send, int *recv, int n) const {
PROFILE_START("sumScan<int>", profile_level);
MPI_Scan((void *)send, (void *)recv, n, MPI_INT, MPI_SUM, communicator);
PROFILE_STOP("sumScan<int>", profile_level);
}
// long int
template <>
void MPI_CLASS::call_sumScan<long int>(const long int *send, long int *recv,
int n) const {
PROFILE_START("sumScan<long int>", profile_level);
MPI_Scan((void *)send, (void *)recv, n, MPI_LONG, MPI_SUM, communicator);
PROFILE_STOP("sumScan<long int>", profile_level);
}
// unsigned long int
template <>
void MPI_CLASS::call_sumScan<unsigned long>(const unsigned long *send,
unsigned long *recv, int n) const {
PROFILE_START("sumScan<unsigned long>", profile_level);
MPI_Scan((void *)send, (void *)recv, n, MPI_UNSIGNED_LONG, MPI_SUM,
communicator);
PROFILE_STOP("sumScan<unsigned long>", profile_level);
}
// size_t
#ifdef USE_WINDOWS
template <>
void MPI_CLASS::call_sumScan<size_t>(const size_t *send, size_t *recv,
int n) const {
MPI_ASSERT(MPI_SIZE_T != 0);
PROFILE_START("sumScan<size_t>", profile_level);
MPI_Scan((void *)send, (void *)recv, n, MPI_SIZE_T, MPI_SUM, communicator);
PROFILE_STOP("sumScan<size_t>", profile_level);
}
#endif
// float
template <>
void MPI_CLASS::call_sumScan<float>(const float *send, float *recv,
int n) const {
PROFILE_START("sumScan<float>", profile_level);
MPI_Scan((void *)send, (void *)recv, n, MPI_FLOAT, MPI_SUM, communicator);
PROFILE_STOP("sumScan<float>", profile_level);
}
// double
template <>
void MPI_CLASS::call_sumScan<double>(const double *send, double *recv,
int n) const {
PROFILE_START("sumScan<double>", profile_level);
MPI_Scan((void *)send, (void *)recv, n, MPI_DOUBLE, MPI_SUM, communicator);
PROFILE_STOP("sumScan<double>", profile_level);
}
// std::complex<double>
template <>
void MPI_CLASS::call_sumScan<std::complex<double>>(
const std::complex<double> *x, std::complex<double> *y, int n) const {
auto send = new double[2 * n];
auto recv = new double[2 * n];
for (int i = 0; i < n; i++) {
send[2 * i + 0] = real(x[i]);
send[2 * i + 1] = imag(x[i]);
}
MPI_Scan((void *)send, (void *)recv, 2 * n, MPI_DOUBLE, MPI_SUM,
communicator);
for (int i = 0; i < n; i++)
y[i] = std::complex<double>(recv[2 * i + 0], recv[2 * i + 1]);
delete[] send;
delete[] recv;
}
#endif
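// Usage sketch (illustrative only; assumes the public sumScan() wrapper forwards to
// call_sumScan above). MPI_Scan is inclusive, so rank r receives the sum over
// ranks 0..r:
//     int local = 1, prefix = 0;
//     comm.sumScan(&local, &prefix, 1);   // prefix == r + 1 on rank r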
/************************************************************************
* call_minScan *
* Note: these specializations are only called when using MPI. *
************************************************************************/
#ifdef USE_MPI
// unsigned char
template <>
void MPI_CLASS::call_minScan<unsigned char>(const unsigned char *send,
unsigned char *recv, int n) const {
PROFILE_START("minScan<unsigned char>", profile_level);
MPI_Scan((void *)send, (void *)recv, n, MPI_UNSIGNED_CHAR, MPI_MIN,
communicator);
PROFILE_STOP("minScan<unsigned char>", profile_level);
}
// char
template <>
void MPI_CLASS::call_minScan<char>(const char *send, char *recv, int n) const {
PROFILE_START("minScan<char>", profile_level);
MPI_Scan((void *)send, (void *)recv, n, MPI_SIGNED_CHAR, MPI_MIN,
communicator);
PROFILE_STOP("minScan<char>", profile_level);
}
// unsigned int
template <>
void MPI_CLASS::call_minScan<unsigned int>(const unsigned int *send,
unsigned int *recv, int n) const {
PROFILE_START("minScan<unsigned int>", profile_level);
MPI_Scan((void *)send, (void *)recv, n, MPI_UNSIGNED, MPI_MIN,
communicator);
PROFILE_STOP("minScan<unsigned int>", profile_level);
}
// int
template <>
void MPI_CLASS::call_minScan<int>(const int *send, int *recv, int n) const {
PROFILE_START("minScan<int>", profile_level);
MPI_Scan((void *)send, (void *)recv, n, MPI_INT, MPI_MIN, communicator);
PROFILE_STOP("minScan<int>", profile_level);
}
// unsigned long int
template <>
void MPI_CLASS::call_minScan<unsigned long int>(const unsigned long int *send,
unsigned long int *recv,
int n) const {
PROFILE_START("minScan<unsigned long>", profile_level);
MPI_Scan((void *)send, (void *)recv, n, MPI_UNSIGNED_LONG, MPI_MIN,
communicator);
PROFILE_STOP("minScan<unsigned long>", profile_level);
}
// long int
template <>
void MPI_CLASS::call_minScan<long int>(const long int *send, long int *recv,
int n) const {
PROFILE_START("minScan<long int>", profile_level);
MPI_Scan((void *)send, (void *)recv, n, MPI_LONG, MPI_MIN, communicator);
PROFILE_STOP("minScan<long int>", profile_level);
}
// size_t
#ifdef USE_WINDOWS
template <>
void MPI_CLASS::call_minScan<size_t>(const size_t *send, size_t *recv,
int n) const {
MPI_ASSERT(MPI_SIZE_T != 0);
PROFILE_START("minScan<size_t>", profile_level);
MPI_Scan((void *)send, (void *)recv, n, MPI_SIZE_T, MPI_MIN, communicator);
PROFILE_STOP("minScan<size_t>", profile_level);
}
#endif
// float
template <>
void MPI_CLASS::call_minScan<float>(const float *send, float *recv,
int n) const {
PROFILE_START("minScan<float>", profile_level);
MPI_Scan((void *)send, (void *)recv, n, MPI_FLOAT, MPI_MIN, communicator);
PROFILE_STOP("minScan<float>", profile_level);
}
// double
template <>
void MPI_CLASS::call_minScan<double>(const double *send, double *recv,
int n) const {
PROFILE_START("minScan<double>", profile_level);
MPI_Scan((void *)send, (void *)recv, n, MPI_DOUBLE, MPI_MIN, communicator);
PROFILE_STOP("minScan<double>", profile_level);
}
#endif
/************************************************************************
* call_maxScan *
* Note: these specializations are only called when using MPI. *
************************************************************************/
#ifdef USE_MPI
// unsigned char
template <>
void MPI_CLASS::call_maxScan<unsigned char>(const unsigned char *send,
unsigned char *recv, int n) const {
PROFILE_START("maxScan<unsigned char>", profile_level);
MPI_Scan((void *)send, (void *)recv, n, MPI_UNSIGNED_CHAR, MPI_MAX,
communicator);
PROFILE_STOP("maxScan<unsigned char>", profile_level);
}
// char
template <>
void MPI_CLASS::call_maxScan<char>(const char *send, char *recv, int n) const {
PROFILE_START("maxScan<char>", profile_level);
MPI_Scan((void *)send, (void *)recv, n, MPI_SIGNED_CHAR, MPI_MAX,
communicator);
PROFILE_STOP("maxScan<char>", profile_level);
}
// unsigned int
template <>
void MPI_CLASS::call_maxScan<unsigned int>(const unsigned int *send,
unsigned int *recv, int n) const {
PROFILE_START("maxScan<unsigned int>", profile_level);
MPI_Scan((void *)send, (void *)recv, n, MPI_UNSIGNED, MPI_MAX,
communicator);
PROFILE_STOP("maxScan<unsigned int>", profile_level);
}
// int
template <>
void MPI_CLASS::call_maxScan<int>(const int *send, int *recv, int n) const {
PROFILE_START("maxScan<int>", profile_level);
MPI_Scan((void *)send, (void *)recv, n, MPI_INT, MPI_MAX, communicator);
PROFILE_STOP("maxScan<int>", profile_level);
}
// long int
template <>
void MPI_CLASS::call_maxScan<long int>(const long int *send, long int *recv,
int n) const {
PROFILE_START("maxScan<long int>", profile_level);
MPI_Scan((void *)send, (void *)recv, n, MPI_LONG, MPI_MAX, communicator);
PROFILE_STOP("maxScan<long int>", profile_level);
}
// unsigned long int
template <>
void MPI_CLASS::call_maxScan<unsigned long int>(const unsigned long int *send,
unsigned long int *recv,
int n) const {
PROFILE_START("maxScan<unsigned long>", profile_level);
MPI_Scan((void *)send, (void *)recv, n, MPI_UNSIGNED_LONG, MPI_MAX,
communicator);
PROFILE_STOP("maxScan<unsigned long>", profile_level);
}
// size_t
#ifdef USE_WINDOWS
template <>
void MPI_CLASS::call_maxScan<size_t>(const size_t *send, size_t *recv,
int n) const {
MPI_ASSERT(MPI_SIZE_T != 0);
PROFILE_START("maxScan<size_t>", profile_level);
MPI_Scan((void *)send, (void *)recv, n, MPI_SIZE_T, MPI_MAX, communicator);
PROFILE_STOP("maxScan<size_t>", profile_level);
}
#endif
// float
template <>
void MPI_CLASS::call_maxScan<float>(const float *send, float *recv,
int n) const {
PROFILE_START("maxScan<float>", profile_level);
    MPI_Scan((void *)send, (void *)recv, n, MPI_FLOAT, MPI_MAX, communicator);
PROFILE_STOP("maxScan<float>", profile_level);
}
// double
template <>
void MPI_CLASS::call_maxScan<double>(const double *send, double *recv,
int n) const {
PROFILE_START("maxScan<double>", profile_level);
MPI_Scan((void *)send, (void *)recv, n, MPI_DOUBLE, MPI_MAX, communicator);
PROFILE_STOP("maxScan<double>", profile_level);
}
#endif
/************************************************************************
* Communicate ranks for communication *
************************************************************************/
std::vector<int> MPI_CLASS::commRanks(const std::vector<int> &ranks) const {
#ifdef USE_MPI
// Get a byte array with the ranks to communicate
auto data1 = new char[comm_size];
auto data2 = new char[comm_size];
memset(data1, 0, comm_size);
memset(data2, 0, comm_size);
for (auto &rank : ranks)
data1[rank] = 1;
MPI_Alltoall(data1, 1, MPI_CHAR, data2, 1, MPI_CHAR, communicator);
int N = 0;
for (int i = 0; i < comm_size; i++)
N += data2[i];
std::vector<int> ranks_out;
ranks_out.reserve(N);
for (int i = 0; i < comm_size; i++) {
if (data2[i])
ranks_out.push_back(i);
}
delete[] data1;
delete[] data2;
return ranks_out;
#else
return ranks;
#endif
}
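// Usage sketch (illustrative only; "comm" and the neighbor ranks are hypothetical).
// Each rank passes the ranks it will communicate with and learns which ranks
// selected it in turn:
//     std::vector<int> send_to = {3, 7};
//     std::vector<int> recv_from = comm.commRanks(send_to);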
/************************************************************************
* Wait functions *
************************************************************************/
#ifdef USE_MPI
void MPI_CLASS::wait(MPI_Request request) {
PROFILE_START("wait", profile_level);
MPI_Status status;
MPI_Wait(&request, &status);
/*int flag = 0;
int err = MPI_Test( &request, &flag, &status );
MPI_ASSERT( err == MPI_SUCCESS ); // Check that the first call is valid
while ( !flag ) {
// Put the current thread to sleep to allow other threads to run
sched_yield();
// Check if the request has finished
MPI_Test( &request, &flag, &status );
}*/
PROFILE_STOP("wait", profile_level);
}
int MPI_CLASS::waitAny(int count, MPI_Request *request) {
if (count == 0)
return -1;
PROFILE_START("waitAny", profile_level);
int index = -1;
auto status = new MPI_Status[count];
MPI_Waitany(count, request, &index, status);
/*int flag = 0;
int err = MPI_Testany( count, request, &index, &flag, status );
MPI_ASSERT( err == MPI_SUCCESS ); // Check that the first call is valid
while ( !flag ) {
// Put the current thread to sleep to allow other threads to run
sched_yield();
// Check if the request has finished
MPI_Testany( count, request, &index, &flag, status );
}
MPI_ASSERT( index >= 0 ); // Check that the index is valid*/
delete[] status;
PROFILE_STOP("waitAny", profile_level);
return index;
}
void MPI_CLASS::waitAll(int count, MPI_Request *request) {
if (count == 0)
return;
PROFILE_START("waitAll", profile_level);
auto status = new MPI_Status[count];
MPI_Waitall(count, request, status);
/*int flag = 0;
int err = MPI_Testall( count, request, &flag, status );
MPI_ASSERT( err == MPI_SUCCESS ); // Check that the first call is valid
while ( !flag ) {
// Put the current thread to sleep to allow other threads to run
sched_yield();
// Check if the request has finished
MPI_Testall( count, request, &flag, status );
}*/
PROFILE_STOP("waitAll", profile_level);
delete[] status;
}
std::vector<int> MPI_CLASS::waitSome(int count, MPI_Request *request) {
if (count == 0)
return std::vector<int>();
PROFILE_START("waitSome", profile_level);
    std::vector<int> indices(count, -1);
    auto *status = new MPI_Status[count];
    int outcount = 0;
    MPI_Waitsome(count, request, &outcount, indices.data(), status);
    /*int err = MPI_Testsome( count, request, &outcount, &indices[0], status );
    MPI_ASSERT( err == MPI_SUCCESS ); // Check that the first call is valid
    MPI_ASSERT( outcount != MPI_UNDEFINED ); // Check that the first call is valid
    while ( outcount == 0 ) {
        // Put the current thread to sleep to allow other threads to run
        sched_yield();
        // Check if the request has finished
        MPI_Testsome( count, request, &outcount, &indices[0], status );
    }*/
    indices.resize(outcount);
    delete[] status;
    PROFILE_STOP("waitSome", profile_level);
    return indices;
}
#else
void MPI_CLASS::wait(MPI_Request request) {
PROFILE_START("wait", profile_level);
while (1) {
// Check if the request is in our list
if (global_isendrecv_list.find(request) == global_isendrecv_list.end())
break;
// Put the current thread to sleep to allow other threads to run
sched_yield();
}
PROFILE_STOP("wait", profile_level);
}
int MPI_CLASS::waitAny(int count, MPI_Request *request) {
if (count == 0)
return -1;
PROFILE_START("waitAny", profile_level);
int index = 0;
while (1) {
// Check if the request is in our list
bool found_any = false;
for (int i = 0; i < count; i++) {
if (global_isendrecv_list.find(request[i]) ==
global_isendrecv_list.end()) {
found_any = true;
index = i;
}
}
if (found_any)
break;
// Put the current thread to sleep to allow other threads to run
sched_yield();
}
PROFILE_STOP("waitAny", profile_level);
return index;
}
void MPI_CLASS::waitAll(int count, MPI_Request *request) {
if (count == 0)
return;
PROFILE_START("waitAll", profile_level);
while (1) {
// Check if the request is in our list
bool found_all = true;
for (int i = 0; i < count; i++) {
if (global_isendrecv_list.find(request[i]) !=
global_isendrecv_list.end())
found_all = false;
}
if (found_all)
break;
// Put the current thread to sleep to allow other threads to run
sched_yield();
}
PROFILE_STOP("waitAll", profile_level);
}
std::vector<int> MPI_CLASS::waitSome(int count, MPI_Request *request) {
if (count == 0)
return std::vector<int>();
PROFILE_START("waitSome", profile_level);
    std::vector<int> indices;
    while (1) {
        // Check if the request is in our list
        for (int i = 0; i < count; i++) {
            if (global_isendrecv_list.find(request[i]) ==
                global_isendrecv_list.end())
                indices.push_back(i);
        }
        if (!indices.empty())
            break;
        // Put the current thread to sleep to allow other threads to run
        sched_yield();
    }
    PROFILE_STOP("waitSome", profile_level);
    return indices;
}
#endif
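// Usage sketch (illustrative only; "comm", "reqs" and "count" are hypothetical).
// Process requests as they complete instead of blocking on all of them:
//     int remaining = count;
//     while (remaining > 0) {
//         auto done = comm.waitSome(count, reqs);   // indices of completed requests
//         remaining -= static_cast<int>(done.size());
//         // ... handle the completed requests here ...
//     }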
/************************************************************************
* Probe functions *
************************************************************************/
#ifdef USE_MPI
int MPI_CLASS::Iprobe(int source, int tag) const {
MPI_INSIST(tag <= d_maxTag, "Maximum tag value exceeded");
MPI_INSIST(tag >= 0, "tag must be >= 0");
MPI_Status status;
int flag = 0;
MPI_Iprobe(source, tag, communicator, &flag, &status);
if (flag == 0)
return -1;
int count;
MPI_Get_count(&status, MPI_BYTE, &count);
MPI_ASSERT(count >= 0);
return count;
}
int MPI_CLASS::probe(int source, int tag) const {
MPI_INSIST(tag <= d_maxTag, "Maximum tag value exceeded");
MPI_INSIST(tag >= 0, "tag must be >= 0");
MPI_Status status;
MPI_Probe(source, tag, communicator, &status);
int count;
MPI_Get_count(&status, MPI_BYTE, &count);
MPI_ASSERT(count >= 0);
return count;
}
#else
int MPI_CLASS::Iprobe(int, int) const {
MPI_ERROR("Not implimented for serial codes (Iprobe)");
return 0;
}
int MPI_CLASS::probe(int, int) const {
MPI_ERROR("Not implimented for serial codes (probe)");
return 0;
}
#endif
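// Example usage (illustrative sketch only): size a receive buffer from a probe.
// probe() blocks until a message is pending, while Iprobe() returns -1 if no
// message is waiting; both report the pending message size in bytes.  The
// Irecv overload below is an assumed declaration from common/MPI.h.
//
//     int bytes = comm.Iprobe( src_rank, tag );
//     if ( bytes >= 0 ) {
//         std::vector<char> buf( bytes );
//         MPI_Request req = comm.Irecv( buf.data(), bytes, src_rank, tag );
//         comm.wait( req );
//     }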
/************************************************************************
* Timer functions *
************************************************************************/
#ifdef USE_MPI
double MPI_CLASS::time() { return MPI_Wtime(); }
double MPI_CLASS::tick() { return MPI_Wtick(); }
#else
double MPI_CLASS::time() {
auto t = std::chrono::system_clock::now();
auto ns = std::chrono::duration_cast<std::chrono::nanoseconds>(
t.time_since_epoch());
return 1e-9 * ns.count();
}
double MPI_CLASS::tick() {
auto period = std::chrono::system_clock::period();
return static_cast<double>(period.num) / static_cast<double>(period.den);
}
#endif
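// Example usage (illustrative sketch only): time a section of code with the
// wrapper's clock, which maps to MPI_Wtime()/MPI_Wtick() when MPI is enabled
// and to std::chrono otherwise.  do_work() is a hypothetical stand-in for the
// code being timed.
//
//     double t0 = comm.time();
//     do_work();
//     double elapsed = comm.time() - t0;   // elapsed wall time in seconds
//     double resolution = comm.tick();     // clock resolution in seconds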
/************************************************************************
* Serialize a block of code across MPI processes *
************************************************************************/
void MPI_CLASS::serializeStart() {
#ifdef USE_MPI
using namespace std::chrono_literals;
if (comm_rank == 0) {
// Start rank 0 immediately
} else {
// Wait for a message from the previous rank
MPI_Request request;
MPI_Status status;
int flag = false, buf = 0;
MPI_Irecv(&buf, 1, MPI_INT, comm_rank - 1, 5627, MPI_COMM_WORLD,
&request);
while (!flag) {
MPI_Test(&request, &flag, &status);
std::this_thread::sleep_for(50ms);
}
}
#endif
}
void MPI_CLASS::serializeStop() {
#ifdef USE_MPI
using namespace std::chrono_literals;
if (comm_rank < comm_size - 1) {
// Send flag to next rank
MPI_Send(&comm_rank, 1, MPI_INT, comm_rank + 1, 5627, MPI_COMM_WORLD);
// Wait for final finished flag
int flag = false, buf = 0;
MPI_Request request;
MPI_Status status;
MPI_Irecv(&buf, 1, MPI_INT, comm_size - 1, 5627, MPI_COMM_WORLD,
&request);
while (!flag) {
MPI_Test(&request, &flag, &status);
std::this_thread::sleep_for(50ms);
}
} else {
// Send final flag to all ranks
for (int i = 0; i < comm_size - 1; i++)
MPI_Send(&comm_rank, 1, MPI_INT, i, 5627, MPI_COMM_WORLD);
}
#endif
}
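// Example usage (illustrative sketch only): execute a block of code one rank at
// a time (rank 0 first), e.g. to keep per-rank diagnostic output from
// interleaving.  Note that the handshake above is performed over
// MPI_COMM_WORLD, so all ranks must participate.
//
//     comm.serializeStart();
//     std::cout << "Rank " << comm.getRank() << " reporting\n";
//     comm.serializeStop();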
/****************************************************************************
* Function to start/stop MPI *
****************************************************************************/
#ifdef USE_MPI
static bool called_MPI_Init = false;
#endif
bool MPI_CLASS::MPI_Active() {
#ifdef USE_MPI
    int MPI_initialized, MPI_finalized;
    MPI_Initialized(&MPI_initialized);
    MPI_Finalized(&MPI_finalized);
    return MPI_initialized != 0 && MPI_finalized == 0;
#else
return false;
#endif
}
void MPI_CLASS::start_MPI(int argc, char *argv[], int profile_level) {
changeProfileLevel(profile_level);
NULL_USE(argc);
NULL_USE(argv);
#ifdef USE_MPI
if (MPI_Active()) {
called_MPI_Init = false;
} else {
int provided;
int result =
MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided);
if (result != MPI_SUCCESS)
MPI_ERROR("Unable to initialize MPI");
if (provided < MPI_THREAD_MULTIPLE)
std::cerr
<< "Warning: Failed to start MPI with MPI_THREAD_MULTIPLE\n";
called_MPI_Init = true;
}
#endif
}
void MPI_CLASS::stop_MPI() {
#ifdef USE_MPI
int finalized;
MPI_Finalized(&finalized);
if (called_MPI_Init && !finalized) {
MPI_Barrier(MPI_COMM_WORLD);
MPI_Finalize();
        // Reset the flag since this wrapper no longer owns an active MPI
        called_MPI_Init = false;
}
#endif
}
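// Example usage (illustrative sketch only): typical program structure around
// the start/stop helpers.  start_MPI() initializes MPI (requesting
// MPI_THREAD_MULTIPLE) only if it is not already running, and stop_MPI()
// finalizes it only if this wrapper performed the initialization.  The static
// calls and the MPI(MPI_Comm) constructor below are assumed to match the
// declarations in common/MPI.h.
//
//     int main( int argc, char *argv[] ) {
//         Utilities::MPI::start_MPI( argc, argv, 0 );
//         {
//             Utilities::MPI comm( MPI_COMM_WORLD );
//             // ... application code using comm ...
//         }
//         Utilities::MPI::stop_MPI();
//         return 0;
//     }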
/****************************************************************************
* Function to perform load balancing *
****************************************************************************/
MPI MPI::loadBalance(double local, std::vector<double> work) {
    MPI_ASSERT((int)work.size() == getSize());
    // Gather the measured performance of every rank
    auto perf = allGather(local);
    // Sort the ranks by performance and the work entries by size
    std::vector<int> I(work.size());
    for (size_t i = 0; i < work.size(); i++)
        I[i] = i;
    auto J = I;
    quicksort(perf, I);
    quicksort(work, J);
    // Pair the i-th entry of each sorted list to build the new rank ordering
    std::vector<int> key(work.size());
    for (size_t i = 0; i < work.size(); i++)
        key[J[i]] = I[i];
    // Reorder the ranks within the communicator using the computed key
    return split(0, key[getRank()]);
}
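// Example usage (illustrative sketch only): reorder the ranks of a communicator
// so that measured per-rank performance is matched against a prescribed work
// distribution (one entry per rank).  measurePerformance() is a hypothetical
// benchmark supplied by the application.
//
//     double my_perf = measurePerformance();
//     std::vector<double> work( comm.getSize(), 1.0 );  // uniform work here
//     Utilities::MPI balanced = comm.loadBalance( my_perf, work );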
/****************************************************************************
 * Functions for persistent communication                                   *
****************************************************************************/
template <>
std::shared_ptr<MPI_Request> MPI::Isend_init<double>(const double *buf, int N,
int proc, int tag) const {
std::shared_ptr<MPI_Request> obj(new MPI_Request, [](MPI_Request *req) {
MPI_Request_free(req);
delete req;
});
MPI_Send_init(buf, N, MPI_DOUBLE, proc, tag, communicator, obj.get());
return obj;
}
template <>
std::shared_ptr<MPI_Request> MPI::Irecv_init<double>(double *buf, int N,
int proc, int tag) const {
std::shared_ptr<MPI_Request> obj(new MPI_Request, [](MPI_Request *req) {
MPI_Request_free(req);
delete req;
});
MPI_Recv_init(buf, N, MPI_DOUBLE, proc, tag, communicator, obj.get());
return obj;
}
void MPI::Start(MPI_Request &request) { MPI_Start(&request); }
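// Example usage (illustrative sketch only): persistent double-precision
// send/receive channels restarted every iteration.  dst_rank, src_rank, tag,
// N, and n_iterations are placeholders; send_buf/recv_buf point to N doubles,
// and Isend_init/Irecv_init are assumed to be declared as member templates in
// common/MPI.h (the double specializations are defined above).
//
//     auto send_req = comm.Isend_init( send_buf, N, dst_rank, tag );
//     auto recv_req = comm.Irecv_init( recv_buf, N, src_rank, tag );
//     for ( int it = 0; it < n_iterations; it++ ) {
//         comm.Start( *recv_req );
//         comm.Start( *send_req );
//         comm.wait( *send_req );
//         comm.wait( *recv_req );
//     }
//     // The shared_ptr deleters call MPI_Request_free when the channels expire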
} // namespace Utilities