// Standard library headers used by this test
#include <complex>
#include <cstdio>
#include <iostream>
#include <map>
#include <set>
#include <string>
#include <typeinfo>
#include <vector>

#include "common/MPI.h"
#include "common/UnitTest.h"
#include "common/Utilities.h"
#include "common/Utilities.hpp"
#include "common/ScaLBL.h"
#include "ProfilerApp.h"

#if defined( WIN32 ) || defined( _WIN32 ) || defined( WIN64 ) || defined( _WIN64 )
#include <windows.h>
#define sched_yield() Sleep( 0 )
#else
#include <sched.h>
#endif

#undef MPI_CLASS
#define MPI_CLASS Utilities::MPI
#define MPI_ASSERT ASSERT


// Return the time elapsed in seconds
static inline double time() { return MPI_CLASS::time(); }


// Simple user-defined type used to test communication of unknown data types
struct mytype {
    int a;
    double b;
    mytype() { a = -1; b = -1.0; }
    mytype( int i ) { a = i; b = -1.0; }
    mytype( int i, double d ) { a = i; b = d; }
    bool operator==( const mytype &other ) { return a == other.a && b == other.b; }
    bool operator!=( const mytype &other ) { return a != other.a || b != other.b; }
};


// Routines to test Reduce with known data types
// flag - 0: all tests should pass
//        1: basic reduce should pass, reduce with rank should fail with error message
template<class type>
int testReduce( MPI_CLASS comm, UnitTest *ut, int flag );
template<>
int testReduce<std::complex<double>>( MPI_CLASS comm, UnitTest *ut, int )
{
    PROFILE_START( "testReduce" );
    char message[128];
    std::complex<double> rank = comm.getRank() + 1;
    std::complex<double> N    = ( ( comm.getSize() * ( comm.getSize() + 1 ) ) / 2 );
    // Test sumReduce
    sprintf( message, "sumReduce (%s)", typeid( std::complex<double> ).name() );
    if ( comm.sumReduce<std::complex<double>>( rank ) == N ) ut->passes( message );
    else ut->failure( message );
    sprintf( message, "sumReduce (%s) (x,y)", typeid( std::complex<double> ).name() );
    std::complex<double> y;
    comm.sumReduce<std::complex<double>>( &rank, &y, 1 );
    if ( y == N ) ut->passes( message );
    else ut->failure( message );
    PROFILE_STOP( "testReduce" );
    return 2; // Return the number of tests
}
template<class type>
int testReduce( MPI_CLASS comm, UnitTest *ut, int flag )
{
    PROFILE_START( "testReduce" );
    char message[128];
    auto rank = (type) comm.getRank();
    auto size = (type) comm.getSize();
    if ( (int) ( size ) != comm.getSize() ) {
        sprintf( message, "Reduce (%s) cannot represent the number of processors",
            typeid( type ).name() );
        ut->expected_failure( message );
        PROFILE_STOP2( "testReduce" );
        return 0;
    }
    type x, y;
    int N = ( ( comm.getSize() * ( comm.getSize() + 1 ) ) / 2 );
    // Test sumReduce
    sprintf( message, "sumReduce (%s)", typeid( type ).name() );
    if ( ( (int) ( (type) N ) ) != N )
        ut->expected_failure( message ); // type cannot represent N
    else if ( comm.sumReduce( rank + 1 ) == (type) N )
        ut->passes( message );
    else
        ut->failure( message );
    sprintf( message, "sumReduce (%s) (x,y)", typeid( type ).name() );
    x = rank + 1;
    comm.sumReduce( &x, &y, 1 );
    if ( ( (int) ( (type) N ) ) != N )
        ut->expected_failure( message );
    else if ( y == (type) N )
        ut->passes( message );
    else
        ut->failure( message );
    // Test minReduce
    sprintf( message, "minReduce (%s)", typeid( type ).name() );
    if ( comm.minReduce( rank + 1 ) == 1 ) ut->passes( message );
    else ut->failure( message );
    sprintf( message, "minReduce (%s) (x,y)", typeid( type ).name() );
    comm.minReduce( &x, &y, 1, nullptr );
    if ( y == 1 ) ut->passes( message );
    else ut->failure( message );
    // Test maxReduce
    sprintf( message, "maxReduce (%s)", typeid( type ).name() );
    if ( comm.maxReduce( rank + 1 ) == size ) ut->passes( message );
    else ut->failure( message );
    sprintf( message, "maxReduce (%s) (x,y)", typeid( type ).name() );
    comm.maxReduce( &x, &y, 1, nullptr );
    if ( y == size ) ut->passes( message );
    else ut->failure( message );
    // Test minReduce with rank
    int rank_of_min = -1;
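    // Each rank contributed rank+1 above, so the expected results are min = 1 (owned by
    // rank 0), max = size (owned by rank size-1), and sum = size*(size+1)/2, e.g. 10 for
    // 4 ranks. The rank-returning reductions below must also report the owning rank; for
    // types tested with flag==1 they are instead expected to throw when size > 1.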
    int rank_of_max = -1;
    type rank_min   = rank + 1;
    type rank_max   = rank + 1;
    sprintf( message, "minReduce-rank (%s)", typeid( type ).name() );
    try {
        comm.minReduce( &rank_min, 1, &rank_of_min );
        if ( rank_min == 1 && rank_of_min == 0 ) ut->passes( message );
        else ut->failure( message );
        if ( flag == 1 && comm.getSize() > 1 ) ut->failure( message );
    } catch ( ... ) {
        if ( flag == 1 && comm.getSize() > 1 ) ut->expected_failure( message );
        else ut->failure( message );
    }
    sprintf( message, "minReduce-rank (%s) (x,y)", typeid( type ).name() );
    try {
        comm.minReduce( &x, &rank_min, 1, &rank_of_min );
        if ( rank_min == 1 && rank_of_min == 0 ) ut->passes( message );
        else ut->failure( message );
        if ( flag == 1 && comm.getSize() > 1 ) ut->failure( message );
    } catch ( ... ) {
        if ( flag == 1 && comm.getSize() > 1 ) ut->expected_failure( message );
        else ut->failure( message );
    }
    // Test maxReduce with rank
    sprintf( message, "maxReduce-rank (%s)", typeid( type ).name() );
    try {
        comm.maxReduce( &rank_max, 1, &rank_of_max );
        if ( rank_max == size && rank_of_max == comm.getSize() - 1 ) ut->passes( message );
        else ut->failure( message );
        if ( flag == 1 && comm.getSize() > 1 ) ut->failure( message );
    } catch ( ... ) {
        if ( flag == 1 && comm.getSize() > 1 ) ut->expected_failure( message );
        else ut->failure( message );
    }
    sprintf( message, "maxReduce-rank (%s) (x,y)", typeid( type ).name() );
    try {
        comm.maxReduce( &x, &rank_max, 1, &rank_of_max );
        if ( rank_max == size && rank_of_max == comm.getSize() - 1 ) ut->passes( message );
        else ut->failure( message );
        if ( flag == 1 && comm.getSize() > 1 ) ut->failure( message );
    } catch ( ... ) {
        if ( flag == 1 && comm.getSize() > 1 ) ut->expected_failure( message );
        else ut->failure( message );
    }
    PROFILE_STOP( "testReduce" );
    return 10; // Return the number of tests
}


// Routine to test Scan with known data types
// flag - 0: all tests should pass
//        1: only sumScan is valid (complex)
template<class type>
int testScan( MPI_CLASS comm, UnitTest *ut, int flag = 0 )
{
    PROFILE_START( "testScan" );
    char message[500];
    auto x = ( type )( comm.getRank() + 1 );
    type y;
    sprintf( message, "sumScan (%s)", typeid( type ).name() );
    comm.sumScan( &x, &y, 1 );
    auto N = ( type )( ( ( comm.getRank() + 1 ) * ( comm.getRank() + 2 ) ) / 2 );
    if ( y == N ) ut->passes( message );
    else ut->failure( message );
    if ( flag == 1 ) {
        PROFILE_STOP2( "testScan" );
        return 1;
    }
    sprintf( message, "minScan (%s)", typeid( type ).name() );
    comm.minScan( &x, &y, 1 );
    if ( y == (type) 1 ) ut->passes( message );
    else ut->failure( message );
    sprintf( message, "maxScan (%s)", typeid( type ).name() );
    comm.maxScan( &x, &y, 1 );
    if ( y == x ) ut->passes( message );
    else ut->failure( message );
    PROFILE_STOP( "testScan" );
    return 3; // Return the number of tests
}


// Routine to test bcast
template<class type>
int testBcast( MPI_CLASS comm, UnitTest *ut, type default_val, type new_val )
{
    PROFILE_START( "testBcast" );
    char message[128];
    for ( int i = 0; i < comm.getSize(); i++ ) {
        type tmp1 = default_val;
        if ( comm.getRank() == i )
            tmp1 = new_val;
        sprintf( message, "bcast scalar (%s) from rank %i", typeid( type ).name(), i );
        if ( comm.bcast( tmp1, i ) == new_val ) ut->passes( message );
        else ut->failure( message );
        type tmp2[2];
        tmp2[0] = default_val;
        tmp2[1] = default_val;
        if ( comm.getRank() == i ) {
            tmp2[0] = new_val;
            tmp2[1] = new_val;
        }
        sprintf( message, "bcast vector (%s) from rank %i", typeid( type ).name(), i );
        comm.bcast( tmp2, 2, i );
        if ( tmp2[0] == new_val && tmp2[1] == new_val ) ut->passes( message );
        else ut->failure( message );
    }
    PROFILE_STOP( "testBcast" );
    return 2 * comm.getSize(); // Return the number of tests
}


// Routine to test allGather
template<class type>
int testAllGather( MPI_CLASS comm, UnitTest *ut )
{
    PROFILE_START( "testAllGather" );
    char message[128];
    // Test scalar allGather
    auto x1  = (type) comm.getRank();
    auto *x2 = new type[comm.getSize()];
    comm.allGather( x1, x2 );
    bool pass = true;
    for ( int i = 0; i < comm.getSize(); i++ ) {
        type test = i;
        if ( x2[i] != test )
            pass = false;
    }
    sprintf( message, "allGather scalar (%s)", typeid( type ).name() );
    if ( pass ) ut->passes( message );
    else ut->failure( message );
    // Test vector allGather
    int N     = ( comm.getSize() * ( comm.getSize() + 1 ) ) / 2;
    auto *x3  = new type[comm.getRank() + 1];
    auto *x4  = new type[N];
    auto *x5  = new type[N];
    int *size = new int[comm.getSize()];
    for ( int i = 0; i <= comm.getRank(); i++ )
        x3[i] = (type) comm.getRank();
    int tot1 = comm.allGather( x3, comm.getRank() + 1, x4 );
    int tot2 = comm.allGather( x3, comm.getRank() + 1, x5, size );
    pass = true;
    if ( tot1 != N || tot2 != N )
        pass = false;
    int k = 0;
    for ( int i = 0; i < comm.getSize(); i++ ) {
        if ( size[i] != i + 1 )
            pass = false;
        if ( !pass )
            break;
        for ( int j = 0; j <= i; j++ ) {
            type test = i;
            if ( x4[k] != test || x5[k] != test )
                pass = false;
            k++;
        }
    }
    sprintf( message, "allGather vector (%s)", typeid( type ).name() );
    if ( pass ) ut->passes( message );
    else ut->failure( message );
    delete[] x2;
    delete[] x3;
    delete[] x4;
    delete[] x5;
    delete[] size;
    // Test vector allGather with known receive sizes and non-zero displacements
    auto *send      = new type[comm.getRank() + 1];
    auto *recv      = new type[comm.getSize() * comm.getSize() + 1];
    auto *recv_size = new int[comm.getSize()];
    auto *recv_disp = new int[comm.getSize()];
    for ( int i = 0; i <= comm.getRank(); i++ )
        send[i] = i;
    for ( int i = 0; i < comm.getSize(); i++ )
        recv_size[i] = i + 1;
    for ( int i = 0; i < comm.getSize(); i++ )
        recv_disp[i] = 1 + i * comm.getSize() + comm.getSize() - i - 1;
    for ( int i = 0; i <= comm.getSize() * comm.getSize(); i++ )
        recv[i] = (type) -1;
    int tot = comm.allGather( send, comm.getRank() + 1, recv, recv_size, recv_disp, true );
    pass    = true;
    if ( tot != N )
        pass = false;
    auto test = (type) -1;
    if ( recv[0] != test )
        pass = false;
    for ( int i = 0; i < comm.getSize(); i++ ) {
        for ( int j = 0; j < comm.getSize(); j++ ) {
            int l = j + i * comm.getSize() + 1 - recv_disp[i];
            if ( l >= 0 )
                test = l;
            else
                test = (type) -1;
            if ( recv[j + i * comm.getSize() + 1] != test )
                pass = false;
        }
    }
    sprintf( message, "allGather vector with known recv and non-zero displacements (%s)",
        typeid( type ).name() );
    if ( pass ) ut->passes( message );
    else ut->failure( message );
    delete[] send;
    delete[] recv;
    delete[] recv_size;
    delete[] recv_disp;
    // Test vector allGather with no elements
    size = new int[comm.getSize()];
    sprintf( message, "allGather scalar (%s)", typeid( type ).name() );
    try {
        comm.allGather( &x1, 0, (type *) nullptr, size );
        ut->passes( message );
    } catch ( ... ) {
        ut->failure( message );
    }
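    // The zero-length gather above only checks that contributing no elements is accepted,
    // i.e. that the call returns without throwing.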
    delete[] size;
    PROFILE_STOP( "testAllGather" );
    return 4; // Return the number of tests
}


// Routine to test setGather
template<class type>
int testSetGather( MPI_CLASS comm, UnitTest *ut )
{
    PROFILE_START( "testSetGather" );
    char message[500];
    auto x1 = (type) comm.getRank();
    std::set<type> set;
    set.insert( x1 );
    comm.setGather( set );
    bool pass = true;
    for ( int i = 0; i < comm.getSize(); i++ ) {
        type x2 = i;
        if ( set.find( x2 ) == set.end() )
            pass = false;
    }
    sprintf( message, "setGather (%s)", typeid( type ).name() );
    if ( pass ) ut->passes( message );
    else ut->failure( message );
    PROFILE_STOP( "testSetGather" );
    return 1; // Return the number of tests
}


// Routine to test mapGather
template<class type>
int testMapGather( MPI_CLASS comm, UnitTest *ut )
{
    PROFILE_START( "testMapGather" );
    char message[128];
    auto x1 = (type) comm.getRank();
    std::map<int, type> map;
    map.insert( std::pair<int, type>( comm.getRank(), x1 ) );
    comm.mapGather( map );
    bool pass = true;
    for ( int i = 0; i < comm.getSize(); i++ ) {
        type x2 = i;
        auto it = map.find( i );
        if ( it == map.end() )
            pass = false;
        else if ( it->second != x2 )
            pass = false;
    }
    sprintf( message, "mapGather (%s)", typeid( type ).name() );
    if ( pass ) ut->passes( message );
    else ut->failure( message );
    PROFILE_STOP( "testMapGather" );
    return 1; // Return the number of tests
}


// Routine to test allToAll
template<class type>
int testAllToAll( MPI_CLASS comm, UnitTest *ut )
{
    PROFILE_START( "testAllToAll" );
    bool pass;
    char message[128];
    int size = 0;
    type *send_data, *recv_data;
    auto *send_cnt  = new int[comm.getSize()];
    auto *recv_cnt  = new int[comm.getSize()];
    auto *send_disp = new int[comm.getSize()];
    auto *recv_disp = new int[comm.getSize()];
    // Test allToAll with a scalar value to each processor
    send_data = new type[comm.getSize()];
    recv_data = new type[comm.getSize()];
    for ( int i = 0; i < comm.getSize(); i++ )
        send_data[i] = comm.getSize();
    comm.allToAll( 1, send_data, recv_data );
    pass = true;
    for ( int i = 0; i < comm.getSize(); i++ ) {
        type test = comm.getSize();
        if ( recv_data[i] != test )
            pass = false;
    }
    delete[] send_data;
    delete[] recv_data;
    sprintf( message, "allToAll with scalar (%s)", typeid( type ).name() );
    if ( pass ) ut->passes( message );
    else ut->failure( message );
    // Test allToAll vector with a scalar value to each processor
    send_data = new type[comm.getSize()];
    recv_data = new type[comm.getSize()];
    for ( int i = 0; i < comm.getSize(); i++ ) {
        send_cnt[i]  = 1;
        recv_cnt[i]  = 1;
        send_disp[i] = i;
        recv_disp[i] = i;
        send_data[i] = comm.getSize();
        recv_data[i] = 0;
    }
    size = comm.allToAll( send_data, send_cnt, send_disp, recv_data, recv_cnt, recv_disp, true );
    pass = true;
    if ( size != comm.getSize() )
        pass = false;
    for ( int i = 0; i < comm.getSize(); i++ ) {
        type test = comm.getSize();
        if ( recv_data[i] != test )
            pass = false;
    }
    delete[] send_data;
    delete[] recv_data;
    sprintf( message, "allToAll vector with scalar (%s)", typeid( type ).name() );
    if ( pass ) ut->passes( message );
    else ut->failure( message );
    // Test allToAll with a variable number of values per processor and spacing
    send_data = new type[comm.getSize() * comm.getSize()];
    recv_data = new type[2 * comm.getRank() * comm.getSize()];
    for ( int i = 0; i < comm.getSize(); i++ ) {
        send_cnt[i]  = i;
        recv_cnt[i]  = comm.getRank();
        send_disp[i] = i * comm.getSize();
        recv_disp[i] = 2 * i * comm.getRank();
        for ( int j = 0; j < comm.getSize(); j++ ) {
            if ( j < i )
                send_data[j + send_disp[i]] = i;
            else
                send_data[j + send_disp[i]] = (type) -1;
        }
    }
    for ( int i = 0; i < 2 * comm.getRank() * comm.getSize(); i++ )
        recv_data[i] = (type) -2;
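    // Expected layout after the exchange: this rank receives comm.getRank() values from
    // every rank at displacement 2*i*getRank(), so within each block the first getRank()
    // entries equal getRank() and the remaining padding entries keep the fill value -2.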
    size = comm.allToAll( send_data, send_cnt, send_disp, recv_data, recv_cnt, recv_disp, true );
    pass = true;
    if ( size != comm.getRank() * comm.getSize() )
        pass = false;
    for ( int i = 0; i < comm.getSize(); i++ ) {
        for ( int j = 0; j < 2 * comm.getRank(); j++ ) {
            if ( j < comm.getRank() ) {
                type test = comm.getRank();
                if ( recv_data[j + recv_disp[i]] != test )
                    pass = false;
            } else {
                auto test = (type) -2;
                if ( recv_data[j + recv_disp[i]] != test )
                    pass = false;
            }
        }
    }
    delete[] send_data;
    delete[] recv_data;
    sprintf( message, "allToAll with vector of known size and displacements (%s)",
        typeid( type ).name() );
    if ( pass ) ut->passes( message );
    else ut->failure( message );
    // Test allToAll with an unknown receive length
    send_data        = new type[comm.getSize() * comm.getSize()];
    auto *recv_data1 = new type[comm.getSize() * comm.getSize()];
    auto *recv_data2 = new type[comm.getSize() * comm.getSize()];
    for ( int i = 0; i < comm.getSize(); i++ ) {
        send_cnt[i]  = i;
        recv_cnt[i]  = -1;
        send_disp[i] = i * comm.getSize();
        recv_disp[i] = -1;
        for ( int j = 0; j < comm.getSize(); j++ ) {
            if ( j < i )
                send_data[j + send_disp[i]] = i;
            else
                send_data[j + send_disp[i]] = (type) -1;
        }
    }
    for ( int i = 0; i < comm.getSize() * comm.getSize(); i++ ) {
        recv_data1[i] = (type) -2;
        recv_data2[i] = (type) -2;
    }
    int size1 = comm.allToAll( send_data, send_cnt, send_disp, recv_data1, recv_cnt, recv_disp, false );
    int size2 = comm.allToAll( send_data, send_cnt, send_disp, recv_data2 );
    bool pass1 = true;
    bool pass2 = true;
    if ( size1 != comm.getRank() * comm.getSize() )
        pass1 = false;
    if ( size2 != comm.getRank() * comm.getSize() )
        pass2 = false;
    for ( int i = 0; i < comm.getSize(); i++ ) {
        if ( recv_cnt[i] != comm.getRank() || recv_disp[i] != i * comm.getRank() )
            pass1 = false;
    }
    for ( int i = 0; i < comm.getRank() * comm.getSize(); i++ ) {
        type test = comm.getRank();
        if ( recv_data1[i] != test )
            pass1 = false;
        if ( recv_data2[i] != test )
            pass2 = false;
    }
    delete[] send_data;
    delete[] recv_data1;
    delete[] recv_data2;
    sprintf( message, "allToAll with vector of unknown size (%s)", typeid( type ).name() );
    if ( pass1 ) ut->passes( message );
    else ut->failure( message );
    sprintf( message, "allToAll with vector of unknown size with NULL recv(%s)",
        typeid( type ).name() );
    if ( pass2 ) ut->passes( message );
    else ut->failure( message );
    // Free temporary variables
    delete[] send_cnt;
    delete[] recv_cnt;
    delete[] send_disp;
    delete[] recv_disp;
    PROFILE_STOP( "testAllToAll" );
    return 5; // Return the number of tests
}


// Routine to test send/recv
template<class type>
int testSendRecv( MPI_CLASS comm, UnitTest *ut, type v1, type v2 )
{
    PROFILE_START( "testSendRecv" );
    char message[128];
    // Test send-recv with a known length
    for ( int i = 0; i < comm.getSize(); i++ ) {
        for ( int j = 0; j < comm.getSize(); j++ ) {
            type x  = v1;
            int tag = i + j * comm.getSize();
            sprintf( message, "send-recv %i-%i known length (%s)", i, j, typeid( type ).name() );
            if ( i == j ) {
                // We are not allowed to send/receive from the same processor
                continue;
            } else if ( i == comm.getRank() ) {
                // We are sending
                x = v2;
                comm.send( &x, 1, j, tag );
            } else if ( j == comm.getRank() ) {
                // We are receiving
                int size = 1;
                comm.recv( &x, size, i, false, tag );
                if ( size == 1 && x == v2 ) ut->passes( message );
                else ut->failure( message );
            }
        }
    }
    // Test send-recv with an unknown length
    for ( int i = 0; i < comm.getSize(); i++ ) {
        for ( int j = 0; j < comm.getSize(); j++ ) {
            type x  = v1;
            int tag = i + j * comm.getSize();
            sprintf( message, "send-recv %i-%i unknown length (%s)", i, j, typeid( type ).name() );
            if ( i == j ) {
                // We are not allowed to send/receive from the same processor
                continue;
            } else if ( i == comm.getRank() ) {
                // We are sending
                x = v2;
                comm.send( &x, 1, j, tag );
            } else if ( j == comm.getRank() ) {
                // We are receiving
                int size = 1;
                comm.recv( &x, size, i, true, tag );
                if ( size == 1 && x == v2 ) ut->passes( message );
                else ut->failure( message );
            }
        }
    }
    // Test send-recv with an empty length
    for ( int i = 0; i < comm.getSize(); i++ ) {
        for ( int j = 0; j < comm.getSize(); j++ ) {
            type x  = v1;
            int tag = i + j * comm.getSize();
            sprintf( message, "send-recv %i-%i empty length (%s)", i, j, typeid( type ).name() );
            if ( i == j ) {
                // We are not allowed to send/receive from the same processor
                continue;
            } else if ( i == comm.getRank() ) {
                // We are sending
                x = v2;
                comm.send( &x, 0, j, tag );
            } else if ( j == comm.getRank() ) {
                // We are receiving
                int size = comm.probe( i, tag );
                comm.recv( &x, size, i, false, tag );
                if ( size == 0 ) ut->passes( message );
                else ut->failure( message );
            }
        }
    }
    PROFILE_STOP( "testSendRecv" );
    return 3 * comm.getSize() * comm.getSize(); // Return the number of tests
}


// Routine to test Isend/Irecv
template<class type>
int testIsendIrecv( MPI_CLASS comm, UnitTest *ut, type v1, type v2 )
{
    PROFILE_START( "testIsendIrecv" );
    char message[128];
    std::vector<MPI_Request> sendRequest;
    std::vector<MPI_Request> recvRequest;
    // Send all messages
    for ( int i = 0; i < comm.getSize(); i++ ) {
        // Check if the current rank is sending
        if ( i != comm.getRank() )
            continue;
        for ( int j = 0; j < comm.getSize(); j++ ) {
            // Start a non-blocking send
            int tag             = i + j * comm.getSize();
            MPI_Request request = comm.Isend( &v1, 1, j, tag );
            sendRequest.insert( sendRequest.begin(), request );
        }
    }
    // Recv all messages
    auto *recv_buffer = new type[comm.getSize()];
    for ( int i = 0; i < comm.getSize(); i++ )
        recv_buffer[i] = v2;
    recv_buffer[comm.getRank()] = v1;
    for ( int j = 0; j < comm.getSize(); j++ ) {
        // Check if the current rank is receiving
        if ( j != comm.getRank() )
            continue;
        for ( int i = 0; i < comm.getSize(); i++ ) {
            // Start a non-blocking recv
            int tag             = i + j * comm.getSize();
            MPI_Request request = comm.Irecv( &recv_buffer[i], 1, i, tag );
            recvRequest.insert( recvRequest.begin(), request );
        }
    }
    // Wait for all communications to finish
    MPI_CLASS::wait( sendRequest[0] );
    sendRequest.erase( sendRequest.begin() + 0 );
    while ( !sendRequest.empty() ) {
        int index = comm.waitAny( sendRequest.size(), &( sendRequest[0] ) );
        sendRequest.erase( sendRequest.begin() + index );
    }
    auto finished = MPI_CLASS::waitSome( recvRequest.size(), recvRequest.data() );
    if ( !recvRequest.empty() ) {
        MPI_ASSERT( !finished.empty() );
        for ( auto it = finished.rbegin(); it != finished.rend(); ++it )
            recvRequest.erase( recvRequest.begin() + ( *it ) );
    }
    if ( !recvRequest.empty() )
        MPI_CLASS::waitAll( recvRequest.size(), &( recvRequest[0] ) );
    Utilities::unique( finished );
    // Check the received values
    bool pass = true;
    for ( int i = 0; i < comm.getSize(); i++ ) {
        if ( recv_buffer[i] != v1 )
            pass = false;
    }
    sprintf( message, "Isend-Irecv (%s)", typeid( type ).name() );
    if ( pass ) ut->passes( message );
    else ut->failure( message );
    delete[] recv_buffer;
    PROFILE_STOP( "testIsendIrecv" );
    return comm.getSize() * comm.getSize(); // Return the number of tests
}


// Routine to test CommRanks
int testCommRanks( MPI_CLASS comm, UnitTest *ut )
{
    std::vector<int> neighbors;
    for ( int i = 0; i < comm.getSize(); i++ )
        if ( ( i % 2 ) == 0 )
            neighbors.push_back( i );
    std::vector<int> neighbors2 = comm.commRanks( neighbors );
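    // Every rank listed the even ranks as its neighbors, so commRanks() should return all
    // ranks on the even ranks (everyone communicates with them) and an empty list on the
    // odd ranks; globalRanks() must map every rank to a unique non-negative value.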
    bool pass = true;
    if ( comm.getRank() % 2 == 0 ) {
        pass = static_cast<int>( neighbors2.size() ) == comm.getSize();
        if ( pass ) {
            for ( int i = 0; i < comm.getSize(); i++ )
                pass = pass && neighbors2[i] == i;
        }
    } else {
        pass = neighbors2.empty();
    }
    auto ranks = comm.globalRanks();
    pass       = pass && (int) ranks.size() == comm.getSize();
    for ( int rank : ranks )
        pass = pass && rank >= 0;
    auto ranks2 = ranks;
    Utilities::unique( ranks2 );
    pass = pass && ranks.size() == ranks2.size();
    comm.barrier();
    if ( pass ) ut->passes( "commRanks" );
    else ut->failure( "commRanks" );
    return 1; // Return the number of tests
}


// Structure to contain timer results
struct testCommTimerResults {
    int N_reduce;
    int N_scan;
    int N_bcast;
    int N_allGather;
    int N_setGather;
    int N_mapGather;
    int N_allToAll;
    int N_sendRecv;
    int N_IsendIrecv;
    double t_reduce;
    double t_scan;
    double t_bcast;
    double t_allGather;
    double t_setGather;
    double t_mapGather;
    double t_allToAll;
    double t_sendRecv;
    double t_IsendIrecv;
    // Constructor
    testCommTimerResults()
    {
        N_reduce     = 0;
        N_scan       = 0;
        N_bcast      = 0;
        N_allGather  = 0;
        N_setGather  = 0;
        N_mapGather  = 0;
        N_allToAll   = 0;
        N_sendRecv   = 0;
        N_IsendIrecv = 0;
        t_reduce     = 0.0;
        t_scan       = 0.0;
        t_bcast      = 0.0;
        t_allGather  = 0.0;
        t_setGather  = 0.0;
        t_mapGather  = 0.0;
        t_allToAll   = 0.0;
        t_sendRecv   = 0.0;
        t_IsendIrecv = 0.0;
    }
    // Print the results
    void print()
    {
        printf( "   Reduce:      N = %5i, t_tot = %0.5e, t_avg = %6.1f us\n", N_reduce, t_reduce,
            1e6 * t_reduce / N_reduce );
        printf( "   Scan:        N = %5i, t_tot = %0.5e, t_avg = %6.1f us\n", N_scan, t_scan,
            1e6 * t_scan / N_scan );
        printf( "   Bcast:       N = %5i, t_tot = %0.5e, t_avg = %6.1f us\n", N_bcast, t_bcast,
            1e6 * t_bcast / N_bcast );
        printf( "   allGather:   N = %5i, t_tot = %0.5e, t_avg = %6.1f us\n", N_allGather,
            t_allGather, 1e6 * t_allGather / N_allGather );
        printf( "   allToAll:    N = %5i, t_tot = %0.5e, t_avg = %6.1f us\n", N_allToAll,
            t_allToAll, 1e6 * t_allToAll / N_allToAll );
        printf( "   send-recv:   N = %5i, t_tot = %0.5e, t_avg = %6.1f us\n", N_sendRecv,
            t_sendRecv, 1e6 * t_sendRecv / N_sendRecv );
        printf( "   Isend-Irecv: N = %5i, t_tot = %0.5e, t_avg = %6.1f us\n", N_IsendIrecv,
            t_IsendIrecv, 1e6 * t_IsendIrecv / N_IsendIrecv );
        printf( "   setGather:   N = %5i, t_tot = %0.5e, t_avg = %6.1f us\n", N_setGather,
            t_setGather, 1e6 * t_setGather / N_setGather );
        printf( "   mapGather:   N = %5i, t_tot = %0.5e, t_avg = %6.1f us\n", N_mapGather,
            t_mapGather, 1e6 * t_mapGather / N_mapGather );
    }
};


// This routine will test a single MPI communicator
testCommTimerResults testComm( MPI_CLASS comm, UnitTest *ut )
{
    PROFILE_START( "testComm" );
    testCommTimerResults timer;
    double start_time;
    // Test the tag
    int tag0        = comm.newTag();
    MPI_CLASS comm2 = comm;
    MPI_CLASS comm3( comm );
    bool pass = tag0 > 0 && tag0 < comm.maxTag();
    for ( int i = 1; i < 64; i++ ) {
        if ( comm.newTag() != tag0 + i )
            pass = false;
    }
    for ( int i = 1; i <= 64; i++ ) {
        if ( comm2.newTag() != tag0 + 63 + i )
            pass = false;
    }
    for ( int i = 1; i <= 128; i++ ) {
        if ( comm3.newTag() != tag0 + 127 + i )
            pass = false;
    }
    if ( pass ) ut->passes( "newTag" );
    else ut->failure( "newTag" );
    // Test all and any reduce
    bool test1 = !comm.allReduce( comm.getRank() != 0 );
    bool test2 = comm.allReduce( true );
    if ( test1 && test2 ) ut->passes( "allReduce" );
    else ut->failure( "allReduce" );
    test1 = comm.anyReduce( comm.getRank() == 0 );
    test2 = !comm.anyReduce( false );
    if ( test1 && test2 ) ut->passes( "anyReduce" );
    else ut->failure( "anyReduce" );
    // Test min, max, and sum reduce
    // NOTE: the explicit element types used below (through the Isend/Irecv tests) are
    // assumed; they follow the usual unsigned/signed integer, float, and double list
    // implied by the default values passed to the bcast and send/recv calls.
    start_time = time();
    timer.N_reduce += testReduce<unsigned char>( comm, ut, 0 );
    timer.N_reduce += testReduce<char>( comm, ut, 0 );
    timer.N_reduce += testReduce<unsigned int>( comm, ut, 0 );
    timer.N_reduce += testReduce<int>( comm, ut, 0 );
    timer.N_reduce += testReduce<unsigned long int>( comm, ut, 0 );
    timer.N_reduce += testReduce<long int>( comm, ut, 0 );
    timer.N_reduce += testReduce<size_t>( comm, ut, 0 );
    timer.N_reduce += testReduce<float>( comm, ut, 0 );
    timer.N_reduce += testReduce<double>( comm, ut, 0 );
    timer.N_reduce += testReduce<std::complex<double>>( comm, ut, 2 ); // only sumReduce is valid for complex numbers
    mytype tmp1( 1, -1.0 );
    mytype tmp2;
    if ( comm.getSize() > 1 ) {
        // We can't perform a reduce on an unknown data type (this should throw an error)
        try {
            // This should fail
            tmp2 = comm.sumReduce( tmp1 );
            ut->failure( "sumReduce should give an error with an unknown type" );
        } catch ( ... ) {
            ut->passes( "sumReduce should give an error with an unknown type" );
        }
        try {
            // This should fail
            tmp2 = comm.minReduce( tmp1 );
            ut->failure( "minReduce should give an error with an unknown type" );
        } catch ( ... ) {
            ut->passes( "minReduce should give an error with an unknown type" );
        }
        try {
            // This should fail
            tmp2 = comm.maxReduce( tmp1 );
            ut->failure( "maxReduce should give an error with an unknown type" );
        } catch ( ... ) {
            ut->passes( "maxReduce should give an error with an unknown type" );
        }
        timer.N_reduce += 3;
    }
    timer.t_reduce = time() - start_time;
    // Test min, max, and sum scan
    start_time = time();
    timer.N_scan += testScan<unsigned char>( comm, ut );
    timer.N_scan += testScan<char>( comm, ut );
    timer.N_scan += testScan<unsigned int>( comm, ut );
    timer.N_scan += testScan<int>( comm, ut );
    timer.N_scan += testScan<unsigned long int>( comm, ut );
    timer.N_scan += testScan<long int>( comm, ut );
    timer.N_scan += testScan<size_t>( comm, ut );
    timer.N_scan += testScan<float>( comm, ut );
    timer.N_scan += testScan<double>( comm, ut );
    timer.N_scan += testScan<std::complex<double>>( comm, ut, 1 ); // Only sumScan is valid with complex data
    if ( comm.getSize() > 1 ) {
        // We can't perform a scan on an unknown data type (this should throw an error)
        try {
            // This should fail
            comm.sumScan( &tmp1, &tmp2, 1 );
            ut->failure( "sumReduce should give an error with an unknown type" );
        } catch ( ... ) {
            ut->passes( "sumReduce should give an error with an unknown type" );
        }
        try {
            // This should fail
            comm.minScan( &tmp1, &tmp2, 1 );
            ut->failure( "minReduce should give an error with an unknown type" );
        } catch ( ... ) {
            ut->passes( "minReduce should give an error with an unknown type" );
        }
        try {
            // This should fail
            comm.maxScan( &tmp1, &tmp2, 1 );
            ut->failure( "maxReduce should give an error with an unknown type" );
        } catch ( ... ) {
            ut->passes( "maxReduce should give an error with an unknown type" );
        }
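        // Account for the three unknown-type scan tests above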
        timer.N_scan += 3;
    }
    timer.t_scan = time() - start_time;
    // Test bcast
    start_time = time();
    timer.N_bcast += testBcast<unsigned char>( comm, ut, 0, 1 );
    timer.N_bcast += testBcast<char>( comm, ut, -1, 1 );
    timer.N_bcast += testBcast<unsigned int>( comm, ut, 0, 1 );
    timer.N_bcast += testBcast<int>( comm, ut, -1, 1 );
    timer.N_bcast += testBcast<unsigned long int>( comm, ut, 0, 1 );
    timer.N_bcast += testBcast<long int>( comm, ut, -1, 1 );
    timer.N_bcast += testBcast<size_t>( comm, ut, 0, 1 );
    timer.N_bcast += testBcast<float>( comm, ut, -1.0, 1.0 );
    timer.N_bcast += testBcast<double>( comm, ut, -1.0, 1.0 );
    mytype tmp3( -1, -1.0 );
    mytype tmp4( 1, 1.0 );
    timer.N_bcast += testBcast<mytype>( comm, ut, tmp3, tmp4 );
    timer.t_bcast = time() - start_time;
    // Test barrier
    comm.barrier();
    // Test gather
    start_time = time();
    timer.N_allGather += testAllGather<unsigned char>( comm, ut );
    timer.N_allGather += testAllGather<char>( comm, ut );
    timer.N_allGather += testAllGather<unsigned int>( comm, ut );
    timer.N_allGather += testAllGather<int>( comm, ut );
    timer.N_allGather += testAllGather<unsigned long int>( comm, ut );
    timer.N_allGather += testAllGather<long int>( comm, ut );
    timer.N_allGather += testAllGather<size_t>( comm, ut );
    timer.N_allGather += testAllGather<float>( comm, ut );
    timer.N_allGather += testAllGather<double>( comm, ut );
    timer.N_allGather += testAllGather<std::complex<double>>( comm, ut );
    timer.N_allGather += testAllGather<mytype>( comm, ut );
    timer.t_allGather = time() - start_time;
    // Test std::set gather
    start_time = time();
    timer.N_setGather += testSetGather<unsigned char>( comm, ut );
    timer.N_setGather += testSetGather<char>( comm, ut );
    timer.N_setGather += testSetGather<unsigned int>( comm, ut );
    timer.N_setGather += testSetGather<int>( comm, ut );
    timer.N_setGather += testSetGather<unsigned long int>( comm, ut );
    timer.N_setGather += testSetGather<long int>( comm, ut );
    timer.N_setGather += testSetGather<size_t>( comm, ut );
    timer.N_setGather += testSetGather<float>( comm, ut );
    timer.N_setGather += testSetGather<double>( comm, ut );
    timer.t_setGather = time() - start_time;
    // Test std::map gather
    start_time = time();
    timer.N_mapGather += testMapGather<unsigned char>( comm, ut );
    timer.N_mapGather += testMapGather<char>( comm, ut );
    timer.N_mapGather += testMapGather<unsigned int>( comm, ut );
    timer.N_mapGather += testMapGather<int>( comm, ut );
    timer.N_mapGather += testMapGather<unsigned long int>( comm, ut );
    timer.N_mapGather += testMapGather<long int>( comm, ut );
    timer.N_mapGather += testMapGather<size_t>( comm, ut );
    timer.N_mapGather += testMapGather<float>( comm, ut );
    timer.N_mapGather += testMapGather<double>( comm, ut );
    timer.t_mapGather = time() - start_time;
    // Test allToAll
    start_time = time();
    timer.N_allToAll += testAllToAll<unsigned char>( comm, ut );
    timer.N_allToAll += testAllToAll<char>( comm, ut );
    timer.N_allToAll += testAllToAll<unsigned int>( comm, ut );
    timer.N_allToAll += testAllToAll<int>( comm, ut );
    timer.N_allToAll += testAllToAll<unsigned long int>( comm, ut );
    timer.N_allToAll += testAllToAll<long int>( comm, ut );
    timer.N_allToAll += testAllToAll<size_t>( comm, ut );
    timer.N_allToAll += testAllToAll<float>( comm, ut );
    timer.N_allToAll += testAllToAll<double>( comm, ut );
    timer.N_allToAll += testAllToAll<std::complex<double>>( comm, ut );
    timer.N_allToAll += testAllToAll<mytype>( comm, ut );
    timer.t_allToAll = time() - start_time;
    // Test send/recv
    start_time = time();
    timer.N_sendRecv += testSendRecv<unsigned char>( comm, ut, 0, 1 );
    timer.N_sendRecv += testSendRecv<char>( comm, ut, -1, 1 );
    timer.N_sendRecv += testSendRecv<unsigned int>( comm, ut, 0, 1 );
    timer.N_sendRecv += testSendRecv<int>( comm, ut, -1, 1 );
    timer.N_sendRecv += testSendRecv<unsigned long int>( comm, ut, 0, 1 );
    timer.N_sendRecv += testSendRecv<long int>( comm, ut, -1, 1 );
    timer.N_sendRecv += testSendRecv<size_t>( comm, ut, 0, 1 );
    timer.N_sendRecv += testSendRecv<float>( comm, ut, -1.0, 1.0 );
    timer.N_sendRecv += testSendRecv<double>( comm, ut, -1.0, 1.0 );
    timer.N_sendRecv += testSendRecv<mytype>( comm, ut, tmp3, tmp4 );
    timer.t_sendRecv = time() - start_time;
    // Test Isend/Irecv
    start_time = time();
    timer.N_IsendIrecv += testIsendIrecv<unsigned char>( comm, ut, 0, 1 );
    timer.N_IsendIrecv += testIsendIrecv<char>( comm, ut, -1, 1 );
    timer.N_IsendIrecv += testIsendIrecv<unsigned int>( comm, ut, 0, 1 );
    timer.N_IsendIrecv += testIsendIrecv<int>( comm, ut, -1, 1 );
    timer.N_IsendIrecv += testIsendIrecv<unsigned long int>( comm, ut, 0, 1 );
    timer.N_IsendIrecv += testIsendIrecv<long int>( comm, ut, -1, 1 );
    timer.N_IsendIrecv += testIsendIrecv<size_t>( comm, ut, 0, 1 );
    timer.N_IsendIrecv += testIsendIrecv<float>( comm, ut, -1.0, 1.0 );
    timer.N_IsendIrecv += testIsendIrecv<double>( comm, ut, -1.0, 1.0 );
    timer.N_IsendIrecv += testIsendIrecv<mytype>( comm, ut, tmp3, tmp4 );
    timer.t_IsendIrecv = time() - start_time;
    // Test commRanks
    testCommRanks( comm, ut );
    PROFILE_STOP( "testComm" );
    return timer;
}


// Test comm dup and the number of communicators that can be created
void testCommDup( UnitTest *ut )
{
#if defined( USING_CLANG ) && defined( __APPLE__ )
    // The MPI error handler crashes so this test fails
    // This seems to be a Mac + Clang + MPICH issue only
    ut->expected_failure( "testCommDup skipped for this architecture/compiler" );
#else
    MPI_CLASS globalComm( MPI_COMM_WORLD );
    MPI_CLASS dupComm = globalComm.dup();
    if ( globalComm.getCommunicator() != dupComm.getCommunicator() &&
         dupComm.getSize() == globalComm.getSize() && dupComm.getRank() == globalComm.getRank() ) {
        ut->passes( "dup comm" );
    } else {
        ut->failure( "dup comm" );
        return;
    }
#if defined( USE_PETSC ) && !defined( USE_MPI )
    ut->expected_failure( "Skipping dup tests, PETSc (no-mpi) has a limit of 128 unique comms" );
    return;
#endif
    int N_comm_try = 2000; // Maximum number of comms to try and create
    std::vector<MPI_CLASS> comms;
    comms.reserve( N_comm_try );
    try {
        for ( int i = 0; i < N_comm_try; i++ ) {
            MPI_CLASS tmp_comm = globalComm.dup();
            comms.push_back( tmp_comm );
            MPI_ASSERT( globalComm.getCommunicator() != comms[i].getCommunicator() );
            MPI_ASSERT( comms.back().sumReduce( 1 ) ==
                        globalComm.getSize() ); // We need to communicate as part of the test
        }
        ut->passes( Utilities::stringf( "Created %i comms", N_comm_try ) );
    } catch ( ... ) {
        if ( comms.size() < 252 ) {
            ut->failure( "Could not create 252 different communicators" );
        } else {
            int N = comms.size();
            ut->expected_failure(
                Utilities::stringf( "Failed to create an unlimited number of comms (%i)", N ) );
        }
        std::cout << "Maximum number of concurrent communicators: " << comms.size() << std::endl;
    }
    comms.clear();
    size_t N_dup = 0;
    globalComm.barrier();
    try {
        double start = MPI_CLASS::time();
        for ( int i = 0; i < N_comm_try; i++ ) {
            MPI_CLASS tmp_comm1 = globalComm.dup();
            MPI_CLASS tmp_comm2 = globalComm.dup();
            MPI_ASSERT( globalComm.getCommunicator() != tmp_comm1.getCommunicator() );
            MPI_ASSERT( globalComm.getCommunicator() != tmp_comm2.getCommunicator() );
            MPI_ASSERT( tmp_comm1.getCommunicator() != tmp_comm2.getCommunicator() );
            MPI_ASSERT( tmp_comm1.sumReduce( 1 ) ==
                        globalComm.getSize() ); // We need to communicate as part of the test
            MPI_ASSERT( tmp_comm2.sumReduce( 1 ) ==
                        globalComm.getSize() ); // We need to communicate as part of the test
            N_dup += 2;
        }
        double stop = MPI_CLASS::time();
        ut->passes( "Created/Destroyed an unlimited number of comms" );
        char message[128];
        sprintf( message, "Time to create/destroy comm using MPI_CLASS::dup() is: %0.1f us",
            1e6 * ( stop - start ) / N_dup );
        std::cout << message << std::endl;
    } catch ( ... ) {
        ut->failure( "Failed to create/destroy an unlimited number of comms" );
        std::cout << "Maximum number of communicators created with destruction: " << N_dup
                  << std::endl;
    }
#endif
}
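// Helper used by test_GPU_aware below: each rank posts a non-blocking send of a device
// buffer to rank-1 and a matching receive from rank+1 (wrapping around at the ends), using
// zero-copy buffers allocated through ScaLBL. Send() stages host data into the device send
// buffer and posts the pair of requests; Recv() waits on them and copies the received data
// back to the host.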
class gpuWrapper
{
public:
    gpuWrapper( MPI_CLASS MPI_COMM, int MSG_SIZE );
    ~gpuWrapper();
    void Send( double *values );
    void Recv( double *values );
    double *sendbuf, *recvbuf;

private:
    MPI_Request req1[1], req2[1];
    MPI_CLASS comm;
    int sendCount;
    int recvCount;
    int rank, rank_x, rank_X, nprocs;
    int sendtag, recvtag;
};
gpuWrapper::gpuWrapper( MPI_CLASS MPI_COMM, int MSG_SIZE )
{
    comm      = MPI_COMM.dup();
    rank      = comm.getRank();
    nprocs    = comm.getSize();
    sendCount = MSG_SIZE;
    recvCount = MSG_SIZE;
    ScaLBL_AllocateZeroCopy( (void **) &sendbuf, sendCount * sizeof( double ) ); // Allocate device memory
    ScaLBL_AllocateZeroCopy( (void **) &recvbuf, sendCount * sizeof( double ) ); // Allocate device memory
    rank_X = rank + 1;
    rank_x = rank - 1;
    if ( rank_x < 0 )
        rank_x = nprocs - 1;
    if ( !( rank_X < nprocs ) )
        rank_X = 0;
}
gpuWrapper::~gpuWrapper()
{
    ScaLBL_FreeDeviceMemory( sendbuf );
    ScaLBL_FreeDeviceMemory( recvbuf );
}
void gpuWrapper::Send( double *values )
{
    sendtag = recvtag = 130;
    ScaLBL_CopyToDevice( sendbuf, values, sendCount * sizeof( double ) );
    req1[0] = comm.Isend( sendbuf, sendCount, rank_x, sendtag + 0 );
    req2[0] = comm.Irecv( recvbuf, recvCount, rank_X, recvtag + 0 );
}
void gpuWrapper::Recv( double *values )
{
    comm.waitAll( 1, req1 );
    comm.waitAll( 1, req2 );
    ScaLBL_DeviceBarrier();
    ScaLBL_CopyToHost( values, recvbuf, recvCount * sizeof( double ) );
}


// Test GPU aware MPI
void test_GPU_aware( UnitTest *ut )
{
    constexpr size_t N     = 1024 * 1024;
    constexpr size_t N_msg = 64;
    bool test              = true;
    // Get the comm to use
    MPI_CLASS comm( MPI_COMM_WORLD );
    int rank = comm.getRank();
    int size = comm.getSize();
    NULL_USE( size );
    try {
        // Initialize the device
        int device = ScaLBL_SetDevice( rank );
        NULL_USE( device );
        // create wrapper for communications
        gpuWrapper gpuComm( comm, N );
        // Allocate and initialize the buffers
        size_t bytes               = N * sizeof( double );
        double *device_send[N_msg] = { nullptr };
        double *device_recv[N_msg] = { nullptr };
        double *host_send[N_msg]   = { nullptr };
        double *host_recv[N_msg]   = { nullptr };
        // NOTE: the body of this loop was lost in the source; the following is a sketch
        // of the exchange/check it performs (allocate each message, pass it around the
        // ring through gpuComm, and verify the received values).
        for ( size_t k = 0; k < N_msg; k++ ) {
            host_send[k] = new double[N];
            host_recv[k] = new double[N];
            ScaLBL_AllocateZeroCopy( (void **) &device_send[k], bytes );
            ScaLBL_AllocateZeroCopy( (void **) &device_recv[k], bytes );
            for ( size_t i = 0; i < N; i++ ) {
                host_send[k][i] = k + i; // same data on every rank
                host_recv[k][i] = 0;
            }
            ScaLBL_CopyToDevice( device_send[k], host_send[k], bytes );
            gpuComm.Send( host_send[k] );
            gpuComm.Recv( host_recv[k] );
            for ( size_t i = 0; i < N; i++ ) {
                if ( host_recv[k][i] != host_send[k][i] )
                    test = false;
            }
            ScaLBL_FreeDeviceMemory( device_send[k] );
            ScaLBL_FreeDeviceMemory( device_recv[k] );
            delete[] host_send[k];
            delete[] host_recv[k];
        }
    } catch ( ... ) {
        test = false;
    }
    if ( test ) {
        std::cout << "MPI is GPU aware" << std::endl;
        ut->passes( "GPU aware MPI" );
    } else {
        std::cout << "MPI is NOT GPU aware" << std::endl;
        ut->failure( "GPU aware MPI" );
    }
}


// This test will test the MPI class
int main( int argc, char *argv[] )
{
    // Start MPI
    Utilities::MPI::start_MPI( argc, argv );
    // Create the unit test
    UnitTest ut;
    PROFILE_ENABLE( 0 );
    PROFILE_START( "Main" );
    // Limit the scope so objects are destroyed
    {
        // Get the start time for the tests
        double start_time = time();
        // Print the global size (if we are using MPI)
        int global_size = 0;
#ifdef USE_MPI
        MPI_Comm_size( MPI_COMM_WORLD, &global_size );
#else
        global_size = 1;
#endif
        // Test the global communicator (MPI_COMM_WORLD)
        MPI_CLASS globalComm = MPI_CLASS( MPI_COMM_WORLD );
        if ( !globalComm.isNull() ) ut.passes( "Global communicator created" );
        else ut.failure( "Global communicator created" );
        if ( globalComm.getSize() == global_size ) ut.passes( "Global communicator size" );
        else ut.failure( "Global communicator size" );
        if ( globalComm.getRank() == 0 ) {
            std::cout << "MPI_COMM_WORLD = " << global_size << " processors" << std::endl;
            std::cout << " Largest tag value = " << globalComm.maxTag() << std::endl << std::endl;
        }
#ifdef USE_MPI
        if ( globalComm.getCommunicator() == MPI_COMM_WORLD )
            ut.passes( "Communicator == MPI_COMM_WORLD" );
        else
            ut.failure( "Communicator == MPI_COMM_WORLD" );
#endif
        testCommTimerResults commTimer = testComm( globalComm, &ut );
        if ( globalComm.getRank() == 0 ) {
            std::cout << "Results for global timer (rank 0)" << std::endl;
            commTimer.print();
            std::cout << std::endl;
        }
        // Test bcast with std::string
        std::string rank_string;
        if ( globalComm.getRank() == 0 )
            rank_string = "Rank 0";
        rank_string = globalComm.bcast( rank_string, 0 );
        if ( rank_string == "Rank 0" ) ut.passes( "Bcast std::string" );
        else ut.failure( "Bcast std::string" );
        // Test MPI_COMM_SELF
        MPI_CLASS selfComm = MPI_CLASS( MPI_COMM_SELF );
        if ( !selfComm.isNull() ) ut.passes( "Self communicator created" );
        else ut.failure( "Self communicator created" );
#ifdef USE_MPI
        if ( selfComm.getCommunicator() == MPI_COMM_SELF )
            ut.passes( "Communicator == MPI_COMM_SELF" );
        else
            ut.failure( "Communicator == MPI_COMM_SELF" );
#endif
        testComm( selfComm, &ut );
        // Test == and !=
        if ( globalComm == globalComm && !( selfComm == globalComm ) ) ut.passes( "==" );
        else ut.failure( "==" );
        if ( selfComm != globalComm && !( globalComm != globalComm ) ) ut.passes( "!=" );
        else ut.failure( "!=" );
        // Test MPI_COMM_NULL
        MPI_CLASS nullComm = MPI_CLASS( MPI_COMM_NULL );
        if ( nullComm.isNull() ) ut.passes( "Null communicator created" );
        else ut.failure( "Null communicator created" );
        if ( nullComm.getSize() == 0 ) ut.passes( "Null communicator has zero size" );
        else ut.failure( "Null communicator has zero size" );
#ifdef USE_MPI
        if ( nullComm.getCommunicator() == MPI_COMM_NULL )
            ut.passes( "Communicator == MPI_COMM_NULL" );
        else
            ut.failure( "Communicator == MPI_COMM_NULL" );
#endif
        // Test dup
        MPI_CLASS dupComm = globalComm.dup();
        if ( nullComm.dup().isNull() )
            ut.passes( "Null communicator duplicates a Null communicator" );
        else
            ut.failure( "Null communicator duplicates a Null communicator" );
        testCommDup( &ut );
        // Test compare
        if ( globalComm.compare( globalComm ) == 1 ) ut.passes( "compare comm global==global" );
        else ut.failure( "compare comm global==global" );
        if ( globalComm.compare( dupComm ) == 3 ) ut.passes( "compare comm global~=dup" );
        else ut.failure( "compare comm global~=dup" );
        if ( global_size == 1 ) {
            if ( globalComm.compare( selfComm ) == 3 )
                ut.passes( "compare comm global~=self (global size=1)" );
            else
                ut.failure( "compare comm global~=self (global size=1)" );
        } else {
            if ( globalComm.compare( selfComm ) == 0 ) ut.passes( "compare comm global!=self" );
            else ut.failure( "compare comm global!=self" );
        }
        // Split the global comm and test
        PROFILE_START( "Split" );
        int color;
        if ( globalComm.getRank() == 0 )
            color = 0;
        else if ( globalComm.getRank() < 3 )
            color = 1;
        else
            color = 2 + ( globalComm.getRank() - 2 ) / 4;
        std::vector<MPI_CLASS> splitComms( 4 );
        splitComms[0] = globalComm.split( color );
        splitComms[1] = globalComm.split( color, globalComm.getRank() );
        if ( splitComms[0].getCommunicator() != globalComm.getCommunicator() &&
             splitComms[1].getCommunicator() != globalComm.getCommunicator() &&
             splitComms[0].getCommunicator() != splitComms[1].getCommunicator() )
            ut.passes( "split comm has different communicator" );
        else
            ut.failure( "split comm has different communicator" );
        if ( globalComm.getSize() > 1 ) {
            if ( splitComms[0].getSize() < globalComm.getSize() ) ut.passes( "split comm is smaller" );
            else ut.failure( "split comm is smaller" );
        }
        if ( splitComms[0].getRank() == splitComms[1].getRank() ) ut.passes( "split sort by rank" );
        else ut.failure( "split sort by rank" );
        testComm( splitComms[0], &ut );
        splitComms[2] = globalComm.split( -1 );
        if ( splitComms[2].isNull() ) ut.passes( "split with color=-1 returns NULL communicator" );
        else ut.failure( "split with color=-1 returns NULL communicator" );
        splitComms[3] = splitComms[0]; // Make a copy to ensure there are no memory leaks
        splitComms[3] = splitComms[2]; // Perform assignment to check memory leaks
        MPI_ASSERT( splitComms[3] == splitComms[2] );
        PROFILE_STOP( "Split" );
        // Test < <= > >=
        if ( globalComm.getSize() > 1 ) {
            if ( splitComms[0] < globalComm && splitComms[1] < globalComm &&
                 !( globalComm < globalComm ) && !( globalComm < splitComms[0] ) )
                ut.passes( " < comm" );
            else
                ut.failure( " < comm" );
            if ( splitComms[0] <= globalComm && splitComms[1] <= globalComm &&
                 globalComm <= globalComm && !( globalComm <= splitComms[0] ) )
                ut.passes( " <= comm" );
            else
                ut.failure( " <= comm" );
            if ( globalComm > splitComms[0] && globalComm > splitComms[1] &&
                 !( globalComm > globalComm ) && !( splitComms[0] > globalComm ) )
                ut.passes( " > comm" );
            else
                ut.failure( " > comm" );
            if ( globalComm >= splitComms[0] && globalComm >= splitComms[1] &&
                 globalComm >= globalComm && !( splitComms[0] >= globalComm ) )
                ut.passes( " >= comm" );
            else
                ut.failure( " >= comm" );
        }
        // Test intersection
        // Test globalComm with selfComm
        if ( globalComm.getSize() > 1 ) {
            MPI_CLASS comm1 = MPI_CLASS::intersect( globalComm, selfComm );
            MPI_CLASS comm2 = MPI_CLASS::intersect( selfComm, globalComm );
            MPI_CLASS comm3 = MPI_CLASS::intersect( globalComm, globalComm );
            if ( comm1.compare( globalComm ) == 0 && comm1.compare( selfComm ) != 0 &&
                 comm2.compare( globalComm ) == 0 && comm2.compare( selfComm ) != 0 &&
                 comm3.compare( globalComm ) != 0 && comm3.compare( selfComm ) == 0 )
                ut.passes( "intersection of globalComm and selfComm" );
            else
                ut.failure( "intersection of globalComm and selfComm" );
        }
        // Test case where we have disjoint sets (this can only happen if one of the comms is null)
        {
            MPI_CLASS intersection = MPI_CLASS::intersect( globalComm, nullComm );
            if ( intersection.isNull() ) ut.passes( "intersection of non-overlapping comms" );
            else ut.failure( "intersection of non-overlapping comms" );
        }
        // Test case where the comms partially overlap
        if ( globalComm.getSize() > 2 ) {
            int n = globalComm.getSize() - 1;
            // Intersect 2 comms (all other ranks will be null)
            MPI_CLASS split1 = globalComm.split( globalComm.getRank() == 0 ? -1 : 0 );
            MPI_CLASS split2 = globalComm.split( globalComm.getRank() == n ? -1 : 0 );
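            // With rank 0 removed from split1 and rank n removed from split2, the
            // intersection should contain the middle size-2 ranks and be a null
            // communicator on ranks 0 and n.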
            MPI_CLASS intersection = MPI_CLASS::intersect( split1, split2 );
            bool pass = true;
            if ( globalComm.getRank() == 0 || globalComm.getRank() == n ) {
                if ( !intersection.isNull() )
                    pass = false;
            } else {
                if ( intersection.compare( split1 ) != 0 || intersection.compare( split2 ) != 0 ||
                     intersection.getSize() != globalComm.getSize() - 2 )
                    pass = false;
            }
            // Intersect 2 sets for ranks (3 groups should result)
            // split1 = globalComm.split(globalComm.getRank()==0?1:2);
            // split2 = globalComm.split(globalComm.getRank()==n?1:2);
            // intersection = MPI_CLASS::intersect( split1, split2 );
            // bool pass = true;
            // if ( globalComm.getRank()==0 || globalComm.getRank()==n ) {
            //     if ( intersection.compare(selfComm)==0 )
            //         pass = false;
            // } else {
            //     if ( intersection.compare(split1)!=0 || intersection.compare(split2)!=0 ||
            //          intersection.getSize()!=globalComm.getSize()-2 )
            //         pass = false;
            // }
            if ( pass ) ut.passes( "intersection of partially overlapping comms" );
            else ut.failure( "intersection of partially overlapping comms" );
        }
        // Test splitByNode
        MPI_CLASS nodeComm    = globalComm.splitByNode();
        std::string localName = MPI_CLASS::getNodeName();
        std::vector<std::string> globalStrings( globalComm.getSize() );
        std::vector<std::string> nodeStrings( nodeComm.getSize() );
        globalComm.allGather( localName, &globalStrings[0] );
        nodeComm.allGather( localName, &nodeStrings[0] );
        int N_local = 0;
        for ( auto &nodeString : nodeStrings ) {
            if ( nodeString == localName )
                N_local++;
        }
        int N_global = 0;
        for ( auto &globalString : globalStrings ) {
            if ( globalString == localName )
                N_global++;
        }
        if ( !nodeComm.isNull() && N_local == nodeComm.getSize() && N_local == N_global )
            ut.passes( "splitByNode" );
        else
            ut.failure( "splitByNode" );
        // Test the call to load balance the processes
        MPI_CLASS::balanceProcesses( globalComm, 1 );
        std::vector<int> cpus = MPI_CLASS::getProcessAffinity();
        size_t maxProcNode    = nodeComm.maxReduce( cpus.size() );
        bool pass_balance     = cpus.size() == maxProcNode && !cpus.empty();
        MPI_CLASS::balanceProcesses( globalComm, 2 );
        cpus = MPI_CLASS::getProcessAffinity();
        if ( cpus.size() < 1 || cpus.size() > maxProcNode / nodeComm.getSize() )
            pass_balance = false;
        if ( pass_balance ) {
            ut.passes( "balanceProcesses" );
        } else {
#ifdef __APPLE__
            ut.expected_failure( "balanceProcesses" );
#else
            ut.failure( "balanceProcesses" );
#endif
        }
        // Test the performance of sched_yield (used internally by MPI wait routines)
        globalComm.barrier();
        double start_yield = time();
        for ( int i = 0; i < 10000; i++ )
            sched_yield();
        double time_yield = ( time() - start_yield ) / 10000;
        if ( globalComm.getRank() == 0 )
            std::cout << "Time to yield: " << time_yield * 1e6 << " us" << std::endl;
        // Test time and tick
        double end_time = MPI_CLASS::time();
        double time_res = MPI_CLASS::tick();
        if ( globalComm.getRank() == 0 ) {
            std::cout << "Time to run tests: " << end_time - start_time << std::endl;
            std::cout << "Timer resolution: " << time_res << std::endl;
            if ( time_res > 0 && time_res < 1 && ( end_time - start_time ) >= time_res )
                ut.passes( "time and tick" );
            else
                ut.failure( "time and tick" );
            std::cout << std::endl;
        }
        // Test GPU aware MPI
        test_GPU_aware( &ut );
    } // Limit the scope so objects are destroyed
    // Finished testing, report the results
    PROFILE_START( "Report" );
    double start_time = time();
    ut.report();
    int num_failed = ut.NumFailGlobal();
    double end_time = time();
    if ( MPI_CLASS( MPI_COMM_WORLD ).getRank() == 0 )
        std::cout << "Time to report: " << end_time - start_time << std::endl << std::endl;
    PROFILE_STOP( "Report" );
    PROFILE_STOP( "Main" );
    // Shutdown
    PROFILE_SAVE( "test_MPI" );
    ut.reset();
    Utilities::MPI::stop_MPI();
    return num_failed;
}