// LBPM/threadpool/test/test_thread_pool.cpp
#include "ProfilerApp.h"
#ifdef USE_TIMER
#include "MemoryApp.h"
#endif
#include "threadpool/thread_pool.h"
#include "common/UnitTest.h"
#include "common/Utilities.h"
#include <algorithm>
#include <atomic>
#include <chrono>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <mutex>
#include <stdexcept>
#include <string>
#include <thread>
#include <vector>
#define MAX( x, y ) ( ( x ) > ( y ) ? ( x ) : ( y ) )
#define perr std::cerr
#define pout std::cout
#define printp printf
#ifdef USE_MPI
#include "mpi.h"
#endif
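// Convert a std::chrono duration to an integer count of nanoseconds / milliseconds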
#define to_ns( x ) std::chrono::duration_cast<std::chrono::nanoseconds>( x ).count()
#define to_ms( x ) std::chrono::duration_cast<std::chrono::milliseconds>( x ).count()
// Wrapper functions for mpi
static inline void barrier()
{
#ifdef USE_MPI
MPI_Barrier( MPI_COMM_WORLD );
#endif
}
static inline int getRank()
{
int rank = 0;
#ifdef USE_MPI
MPI_Comm_rank( MPI_COMM_WORLD, &rank );
#endif
return rank;
}
static inline int getSize()
{
int size = 1; // Without MPI there is a single process
#ifdef USE_MPI
MPI_Comm_size( MPI_COMM_WORLD, &size );
#endif
return size;
}
// Function to waste CPU cycles
void waste_cpu( int N )
{
if ( N > 10000 ) {
PROFILE_START( "waste_cpu", 2 );
}
double pi = 3.141592653589793;
double x = 1.0;
N = std::max( 10, N );
{
for ( int i = 0; i < N; i++ )
x = sqrt( x * exp( pi / x ) );
} // style to limit gcov hits
if ( fabs( x - 2.926064057273157 ) > 1e-12 ) {
abort();
}
if ( N > 10000 ) {
PROFILE_STOP( "waste_cpu", 2 );
}
}
// Sleep for the given time
// Note: since we may encounter interrupts, we may not sleep for the desired time
// so we need to perform the sleep in a loop
void sleep_ms( int64_t N )
{
auto t1 = std::chrono::high_resolution_clock::now();
auto t2 = std::chrono::high_resolution_clock::now();
while ( to_ms( t2 - t1 ) < N ) {
int N2 = N - to_ms( t2 - t1 );
std::this_thread::sleep_for( std::chrono::milliseconds( N2 ) );
t2 = std::chrono::high_resolution_clock::now();
}
}
void sleep_s( int N ) { sleep_ms( 1000 * N ); }
// Function to sleep for N seconds then increment a global count
static std::atomic<int> global_sleep_count( 0 );
void sleep_inc( int N )
{
PROFILE_START( "sleep_inc" );
sleep_s( N );
++global_sleep_count;
PROFILE_STOP( "sleep_inc" );
}
void sleep_inc2( double x )
{
sleep_ms( static_cast<int>( round( x * 1000 ) ) );
++global_sleep_count;
}
void sleep_msg( double x, std::string msg )
{
PROFILE_START( msg );
sleep_ms( static_cast<int>( round( x * 1000 ) ) );
NULL_USE( msg );
PROFILE_STOP( msg );
}
bool check_inc( int N ) { return global_sleep_count == N; }
// Function to print the rank, thread number, and processor for the current thread
std::mutex print_processor_mutex;
void print_processor( ThreadPool *tpool )
{
int rank = 0;
#ifdef USE_MPI
MPI_Comm_rank( MPI_COMM_WORLD, &rank );
#endif
int thread = tpool->getThreadNumber();
int processor = ThreadPool::getCurrentProcessor();
char tmp[100];
sprintf( tmp, "%i: Thread,proc = %i,%i\n", rank, thread, processor );
sleep_ms( 10 * rank );
print_processor_mutex.lock();
pout << tmp;
print_processor_mutex.unlock();
sleep_ms( 100 );
}
// Function to test how a member thread interacts with the thread pool
int test_member_thread( ThreadPool *tpool )
{
int N_errors = 0;
// Member threads are not allowed to wait for the pool to finish
try {
tpool->wait_pool_finished();
N_errors++;
} catch ( ... ) {
}
// Member threads are not allowed to change the size of the pool
try {
tpool->setNumThreads( tpool->getNumThreads() );
N_errors++;
} catch ( ... ) {
}
return N_errors;
}
/******************************************************************
* Test the TPOOL_ADD_WORK macro with variable number of arguments *
******************************************************************/
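// The calls below exercise the macro's general form,
//    TPOOL_ADD_WORK( tpool, routine, (arg1, arg2, ...) [, priority] )
// an empty argument list is passed as ( nullptr ), and the trailing priority
// is optional (id52 below passes -1 explicitly).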
static int myfun0() { return 0; }
static int myfun1( int ) { return 1; }
static int myfun2( int, float ) { return 2; }
static int myfun3( int, float, double ) { return 3; }
static int myfun4( int, float, double, char ) { return 4; }
static int myfun5( int, float, double, char, std::string ) { return 5; }
static int myfun6( int, float, double, char, std::string, int ) { return 6; }
static int myfun7( int, float, double, char, std::string, int, int ) { return 7; }
static int test_function_arguments( ThreadPool *tpool )
{
int N_errors = 0;
// Test some basic types of instantiations
ThreadPool::thread_id_t id0 = TPOOL_ADD_WORK( tpool, myfun0, ( nullptr ) );
ThreadPool::thread_id_t id1 = TPOOL_ADD_WORK( tpool, myfun1, ( (int) 1 ) );
ThreadPool::thread_id_t id2 = TPOOL_ADD_WORK( tpool, myfun2, ( (int) 1, (float) 2 ) );
ThreadPool::thread_id_t id3 =
TPOOL_ADD_WORK( tpool, myfun3, ( (int) 1, (float) 2, (double) 3 ) );
ThreadPool::thread_id_t id4 =
TPOOL_ADD_WORK( tpool, myfun4, ( (int) 1, (float) 2, (double) 3, (char) 4 ) );
ThreadPool::thread_id_t id5 = TPOOL_ADD_WORK(
tpool, myfun5, ( (int) 1, (float) 2, (double) 3, (char) 4, std::string( "test" ) ) );
ThreadPool::thread_id_t id52 = TPOOL_ADD_WORK(
tpool, myfun5, ( (int) 1, (float) 2, (double) 3, (char) 4, std::string( "test" ) ), -1 );
ThreadPool::thread_id_t id6 = TPOOL_ADD_WORK( tpool, myfun6,
( (int) 1, (float) 2, (double) 3, (char) 4, std::string( "test" ), (int) 1 ) );
ThreadPool::thread_id_t id7 = TPOOL_ADD_WORK( tpool, myfun7,
( (int) 1, (float) 2, (double) 3, (char) 4, std::string( "test" ), (int) 1, (int) 1 ) );
tpool->wait_pool_finished();
if ( !tpool->isFinished( id0 ) ) {
N_errors++;
}
if ( tpool->getFunctionRet<int>( id0 ) != 0 ) {
N_errors++;
}
if ( tpool->getFunctionRet<int>( id1 ) != 1 ) {
N_errors++;
}
if ( tpool->getFunctionRet<int>( id2 ) != 2 ) {
N_errors++;
}
if ( tpool->getFunctionRet<int>( id3 ) != 3 ) {
N_errors++;
}
if ( tpool->getFunctionRet<int>( id4 ) != 4 ) {
N_errors++;
}
if ( tpool->getFunctionRet<int>( id5 ) != 5 ) {
N_errors++;
}
if ( tpool->getFunctionRet<int>( id52 ) != 5 ) {
N_errors++;
}
if ( tpool->getFunctionRet<int>( id6 ) != 6 ) {
N_errors++;
}
if ( tpool->getFunctionRet<int>( id7 ) != 7 ) {
N_errors++;
}
return N_errors;
}
/******************************************************************
* Examples to derive a user work item *
******************************************************************/
class UserWorkItemVoid : public ThreadPool::WorkItem
{
public:
// User defined constructor (does not need to match any interfaces)
explicit UserWorkItemVoid( int dummy )
{
// User initialized variables
NULL_USE( dummy );
}
// User defined run (can do anything)
void run() override
{
// Perform the tasks
printf( "Hello work from UserWorkItem (void)" );
}
// Will the routine return a result
bool has_result() const override { return false; }
// User defined destructor
~UserWorkItemVoid() override = default;
};
class UserWorkItemInt : public ThreadPool::WorkItemRet<int>
{
public:
// User defined constructor (does not need to match any interfaces)
explicit UserWorkItemInt( int dummy )
{
// User initialized variables
NULL_USE( dummy );
}
// User defined run (can do anything)
void run() override
{
// Perform the tasks
printf( "Hello work from UserWorkItem (int)" );
// Store the result (its type will match the template)
ThreadPool::WorkItemRet<int>::d_result = 1;
}
// User defined destructor
~UserWorkItemInt() override = default;
};
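// Illustrative sketch (not exercised by this test): a user-derived work item
// would typically be submitted through ThreadPool::add_work and, for a
// WorkItemRet<int>, the result read back with getFunctionRet<int>():
//    ThreadPool::WorkItem *work = new UserWorkItemInt( 0 );
//    ThreadPool::thread_id_t id = tpool.add_work( work );
//    tpool.wait( id );
//    int result = tpool.getFunctionRet<int>( id );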
/******************************************************************
* test the time to run N tasks in parallel *
******************************************************************/
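// launchAndTime() submits N copies of routine(args...) to the pool, waits for
// all of them to finish, and returns the elapsed wall-clock time in seconds.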
template<class Ret, class... Args>
inline double launchAndTime( ThreadPool &tpool, int N, Ret ( *routine )( Args... ), Args... args )
{
tpool.wait_pool_finished();
auto start = std::chrono::high_resolution_clock::now();
for ( int i = 0; i < N; i++ )
ThreadPool_add_work( &tpool, 0, routine, args... );
tpool.wait_pool_finished();
auto stop = std::chrono::high_resolution_clock::now();
return std::chrono::duration<double>( stop - start ).count();
}
// Functions used to test move semantics of thread_id_t
volatile ThreadPool::thread_id_t f1( volatile ThreadPool::thread_id_t a ) { return a; }
ThreadPool::thread_id_t f2( ThreadPool::thread_id_t a ) { return a; }
/******************************************************************
* Test the basic functionality of the atomics *
******************************************************************/
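// Each operation below is expected to return the updated value (e.g. 33 after
// incrementing 32), compare_and_swap is expected to report whether the swap
// occurred, and both counters should end at their starting values (32 and 64).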
int test_atomics()
{
using namespace AtomicOperations;
int N_errors = 0;
volatile int32_atomic i32;
volatile int64_atomic i64;
i32 = 32;
i64 = 64;
if ( atomic_increment( &i32 ) != 33 || atomic_increment( &i64 ) != 65 )
N_errors++;
if ( atomic_decrement( &i32 ) != 32 || atomic_decrement( &i64 ) != 64 )
N_errors++;
if ( atomic_add( &i32, 2 ) != 34 || atomic_add( &i64, 4 ) != 68 )
N_errors++;
if ( atomic_compare_and_swap( &i32, 0, 0 ) || atomic_compare_and_swap( &i64, 0, 0 ) )
N_errors++;
if ( !atomic_compare_and_swap( &i32, 34, 32 ) || !atomic_compare_and_swap( &i64, 68, 64 ) )
N_errors++;
if ( i32 != 32 || i64 != 64 )
N_errors++;
return N_errors;
}
/******************************************************************
* Test FIFO behavior *
******************************************************************/
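// Each rank queues N short sleep tasks and, while work remains queued, checks
// that the last started index and the first still-queued index stay within a
// few items of each other, i.e. work is started in roughly the order it was
// added (first-in-first-out).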
void test_FIFO( UnitTest &ut, ThreadPool &tpool )
{
int rank = getRank();
int size = getSize();
const int N = 4000;
for ( int r = 0; r < size; r++ ) {
barrier();
if ( r != rank )
continue;
std::vector<ThreadPool::thread_id_t> ids;
ids.reserve( N );
for ( size_t i = 0; i < N; i++ )
ids.emplace_back( TPOOL_ADD_WORK( &tpool, sleep_inc2, ( 0.001 ) ) );
bool pass = true;
while ( tpool.N_queued() > 0 ) {
int i1 = -1, i2 = ids.size();
for ( int i = N - 1; i >= 0; i-- ) {
bool started = ids[i].started();
if ( started )
i1 = std::max<int>( i1, i ); // Last index to processing item
else
i2 = std::min<int>( i2, i ); // First index to queued item
}
int diff = i1 == -1 ? 0 : ( i2 - i1 - 1 );
if ( abs( diff ) > 4 ) {
printf( "%i %i %i\n", i1, i2, diff );
pass = pass && abs( i2 - i1 - 1 ) <= 2;
}
}
ids.clear();
tpool.wait_pool_finished();
if ( pass )
ut.passes( "Thread pool behaves as FIFO" );
else
ut.failure( "Thread pool does not behave as FIFO" );
}
}
/******************************************************************
* The main program *
******************************************************************/
#ifdef USE_WINDOWS
int __cdecl main( int argc, char **argv )
{
#elif defined( USE_LINUX ) || defined( USE_MAC )
int main( int argc, char *argv[] )
{
#else
#error Unknown OS
#endif
int N_threads = 4; // Number of threads
int N_work = 2000; // Number of work items
int N_it = 10; // Number of cycles to run
int N_problem = 5; // Problem size
PROFILE_ENABLE( 3 );
PROFILE_ENABLE_TRACE();
PROFILE_DISABLE_MEMORY();
UnitTest ut;
// Initialize MPI and set the error handlers
#ifdef USE_MPI
int provided_thread_support = -1;
MPI_Init_thread( &argc, &argv, MPI_THREAD_MULTIPLE, &provided_thread_support );
Utilities::setErrorHandlers();
#endif
int rank = getRank();
int size = getSize();
// Disable OS specific warnings for all non-root ranks
if ( rank > 0 )
ThreadPool::set_OS_warnings( 1 );
NULL_USE( size );
NULL_USE( argc );
NULL_USE( argv );
// Test the atomics
if ( test_atomics() == 0 )
ut.passes( "Atomics passed" );
else
ut.failure( "Atomics failed" );
// Initialize the data
std::vector<int> data1( N_work, 0 );
std::vector<int> priority( N_work, 0 );
for ( int i = 0; i < N_work; i++ ) {
data1[i] = N_problem;
priority[i] = i % 128;
}
// Print the size of the thread pool class
printp( "Size of ThreadPool = %i\n", (int) sizeof( ThreadPool ) );
// Get the number of processors available
barrier();
int N_procs = ThreadPool::getNumberOfProcessors();
if ( N_procs > 0 )
ut.passes( "getNumberOfProcessors" );
else
ut.failure( "getNumberOfProcessors" );
printp( "%i processors available\n", N_procs );
// Get the processor affinities for the process
barrier();
std::vector<int> cpus = ThreadPool::getProcessAffinity();
printp( "%i cpus for current process: ", (int) cpus.size() );
for ( int cpu : cpus )
printp( "%i ", cpu );
printp( "\n" );
if ( !cpus.empty() ) {
ut.passes( "getProcessAffinity" );
} else {
#ifdef __APPLE__
ut.expected_failure( "getProcessAffinity" );
#else
ut.failure( "getProcessAffinity" );
#endif
}
// Test setting the process affinities
barrier();
bool pass = false;
if ( !cpus.empty() && N_procs > 0 ) {
if ( cpus.size() == 1 ) {
// Request affinity with all available processors
cpus.resize( N_procs );
for ( int i = 0; i < N_procs; i++ )
cpus[i] = i;
try {
ThreadPool::setProcessAffinity( cpus );
} catch ( ... ) {
}
cpus = ThreadPool::getProcessAffinity();
printp( "%i cpus for current process (updated): ", (int) cpus.size() );
for ( int cpu : cpus )
printp( "%i ", cpu );
printp( "\n" );
pass = cpus.size() > 1;
} else {
std::vector<int> cpus_orig = cpus;
std::vector<int> cpus_tmp( 1, cpus[0] );
try {
ThreadPool::setProcessAffinity( cpus_tmp );
} catch ( ... ) {
}
cpus = ThreadPool::getProcessAffinity();
if ( cpus.size() == 1 )
pass = true;
try {
ThreadPool::setProcessAffinity( cpus_orig );
} catch ( ... ) {
}
cpus = ThreadPool::getProcessAffinity();
if ( cpus.size() != cpus_orig.size() )
pass = false;
}
}
if ( pass ) {
ut.passes( "setProcessAffinity" );
} else {
#ifdef __APPLE__
ut.expected_failure( "setProcessAffinity" );
#else
ut.failure( "setProcessAffinity" );
#endif
}
int N_procs_used = std::min<int>( N_procs, N_threads );
printp( "%i processors used\n", N_procs_used );
// Create the thread pool
barrier();
printp( "Creating thread pool\n" );
ThreadPool tpool0;
ThreadPool tpool;
ThreadPool::thread_id_t id;
id = TPOOL_ADD_WORK( &tpool, waste_cpu, ( data1[0] ) );
if ( id == ThreadPool::thread_id_t() || !tpool.isValid( id ) )
ut.failure( "Errors with id" );
tpool.setNumThreads( N_threads );
if ( tpool.getNumThreads() == N_threads )
ut.passes( "Created thread pool" );
else
ut.failure( "Failed to create tpool with desired number of threads" );
// Test setting the thread affinities
barrier();
if ( cpus.size() > 1 ) {
sleep_ms( 50 );
// First make sure we can get the thread affinities
std::vector<int> procs = ThreadPool::getThreadAffinity();
if ( procs == cpus ) {
ut.passes( "getThreadAffinity() matches procs" );
} else {
char msg[100];
sprintf( msg, "getThreadAffinity() does not match procs (%i,%i)",
static_cast<int>( procs.size() ), static_cast<int>( cpus.size() ) );
ut.failure( msg );
}
pass = true;
for ( int i = 0; i < N_threads; i++ ) {
std::vector<int> procs_thread = tpool.getThreadAffinity( i );
if ( procs_thread != procs ) {
printp( "%i: Initial thread affinity: ", rank );
for ( int p : procs_thread )
printp( "%i ", p );
printp( "\n" );
pass = false;
}
}
if ( pass )
ut.passes( "getThreadAffinity(thread) matches procs" );
else
ut.failure( "getThreadAffinity(thread) does not match procs" );
// Try to set the thread affinities
pass = true;
if ( !procs.empty() ) {
int N_procs_thread = std::max<int>( (int) cpus.size() / N_threads, 1 );
for ( int i = 0; i < N_threads; i++ ) {
std::vector<int> procs_thread( N_procs_thread, -1 );
for ( int j = 0; j < N_procs_thread; j++ )
procs_thread[j] = procs[( i * N_procs_thread + j ) % procs.size()];
tpool.setThreadAffinity( i, procs_thread );
sleep_ms( 10 ); // Give time for OS to update thread affinities
std::vector<int> procs_thread2 = tpool.getThreadAffinity( i );
if ( procs_thread2 != procs_thread ) {
printp( "%i: Final thread affinity: ", rank );
for ( int p : procs_thread )
printp( "%i ", p );
printp( "\n" );
pass = false;
}
}
}
if ( pass )
ut.passes( "setThreadAffinity passes" );
else
ut.failure( "setThreadAffinity failed to change affinity" );
}
// Reset the thread affinities
barrier();
tpool.setNumThreads( tpool.getNumThreads(), "none" );
// tpool.setNumThreads(tpool.getNumThreads(),"independent");
for ( int i = 0; i < N_threads; i++ ) {
std::vector<int> procs_thread = tpool.getThreadAffinity( i );
printp( "Thread affinity: " );
for ( int p : procs_thread )
printp( "%i ", p );
printp( "\n" );
}
// Print the current processors by thread id
barrier();
ThreadPool::set_OS_warnings( 1 );
print_processor( &tpool );
launchAndTime( tpool, N_threads, print_processor, &tpool );
// Run some basic tests
barrier();
auto start = std::chrono::high_resolution_clock::now();
for ( int n = 0; n < N_it; n++ ) {
for ( int i = 0; i < N_work; i++ )
waste_cpu( data1[i] );
}
auto stop = std::chrono::high_resolution_clock::now();
double time = std::chrono::duration<double>( stop - start ).count();
printp( "Time for serial cycle = %0.0f us\n", 1e6 * time / N_it );
printp( "Time for serial item = %0.0f ns\n", 1e9 * time / ( N_it * N_work ) );
id = TPOOL_ADD_WORK( &tpool, waste_cpu, ( data1[0] ) );
tpool.wait( id );
std::vector<ThreadPool::thread_id_t> ids2;
ids2.push_back( TPOOL_ADD_WORK( &tpool, waste_cpu, ( data1[0] ) ) );
tpool.wait( ids2[0] );
// Test the move operator for thread_id
ThreadPool::thread_id_t id1 = f1( id ); // move-construct from rvalue temporary
ThreadPool::thread_id_t id2 = std::move( id1 ); // move-construct from xvalue
volatile ThreadPool::thread_id_t id3 = f2( id ); // move-construct from rvalue temporary
volatile ThreadPool::thread_id_t id4 = std::move( id3 ); // move-construct from xvalue
id2.reset();
id4.reset();
// Test calling functions with different numbers of arguments
barrier();
printp( "Testing arguments:\n" );
int N_errors_args = test_function_arguments( &tpool );
if ( N_errors_args == 0 )
ut.passes( "Calling function with default arguments" );
else
ut.failure( "Error calling function with default arguments" );
// Check that the threads can sleep in parallel (this does not depend on the number of
// processors)
barrier();
tpool.wait_pool_finished();
start = std::chrono::high_resolution_clock::now();
sleep_inc( 1 );
stop = std::chrono::high_resolution_clock::now();
double sleep_serial = std::chrono::duration<double>( stop - start ).count();
double sleep_parallel = launchAndTime( tpool, N_threads, sleep_inc, 1 );
double sleep_speedup = N_procs_used * sleep_serial / sleep_parallel;
printf( "%i: Speedup on %i sleeping threads: %0.3f\n", rank, N_procs_used, sleep_speedup );
printf( "%i: ts = %0.3f, tp = %0.3f\n", rank, sleep_serial, sleep_parallel );
if ( fabs( sleep_serial - 1.0 ) < 0.05 && fabs( sleep_parallel - 1.0 ) < 0.25 &&
sleep_speedup > 3 )
ut.passes( "Passed thread sleep" );
else
ut.failure( "Failed thread sleep" );
// Check that the threads are actually working in parallel
barrier();
if ( N_procs_used > 1 ) {
#ifdef USE_MPI
// Use a non-blocking serialization of the MPI processes
// if we do not have a sufficient number of processors
bool serialize_mpi = N_procs < N_threads * size;
int buf;
MPI_Request request;
MPI_Status status;
if ( serialize_mpi && rank > 0 ) {
MPI_Irecv( &buf, 1, MPI_INT, rank - 1, 0, MPI_COMM_WORLD, &request );
int flag = false;
while ( !flag ) {
MPI_Test( &request, &flag, &status );
sleep_s( 1 );
}
}
#endif
int N = 20000000; // Enough work to keep the processor busy for ~ 1 s
// Run in serial
start = std::chrono::high_resolution_clock::now();
waste_cpu( N );
stop = std::chrono::high_resolution_clock::now();
double time_serial = std::chrono::duration<double>( stop - start ).count();
// Run in parallel
double time_parallel = launchAndTime( tpool, N_procs_used, waste_cpu, N );
double time_parallel2 = launchAndTime( tpool, N_procs_used, waste_cpu, N / 1000 );
double speedup = N_procs_used * time_serial / time_parallel;
printf( "%i: Speedup on %i procs: %0.3f\n", rank, N_procs_used, speedup );
printf( "%i: ts = %0.3f, tp = %0.3f, tp2 = %0.3f\n", rank, time_serial, time_parallel,
time_parallel2 );
if ( speedup > 1.4 ) {
ut.passes( "Passed speedup test" );
} else {
#ifdef USE_GCOV
ut.expected_failure( "Times do not indicate tests are running in parallel (gcov)" );
#else
ut.failure( "Times do not indicate tests are running in parallel" );
#endif
}
#ifdef USE_MPI
if ( serialize_mpi ) {
if ( rank < size - 1 )
MPI_Send( &N, 1, MPI_INT, rank + 1, 0, MPI_COMM_WORLD );
if ( rank == size - 1 ) {
for ( int i = 0; i < size - 1; i++ )
MPI_Send( &N, 1, MPI_INT, i, 1, MPI_COMM_WORLD );
} else {
MPI_Irecv( &buf, 1, MPI_INT, size - 1, 1, MPI_COMM_WORLD, &request );
int flag = false;
MPI_Status status;
while ( !flag ) {
MPI_Test( &request, &flag, &status );
sleep_s( 1 );
}
}
}
#endif
} else {
ut.expected_failure( "Testing thread performance with fewer than 2 processors" );
}
// Test first-in-first-out scheduler (also ensures priorities)
test_FIFO( ut, tpool );
// Test adding a work item with a dependency
barrier();
{
// Test that we successfully wait on the work items
std::vector<ThreadPool::thread_id_t> ids;
ids.reserve( 5 );
global_sleep_count = 0; // Reset the count before this test
ThreadPool::thread_id_t id0;
auto id1 = TPOOL_ADD_WORK( &tpool, sleep_inc, ( 1 ) );
auto id2 = TPOOL_ADD_WORK( &tpool, sleep_inc, ( 2 ) );
auto *wait1 = new WorkItemFull<bool, int>( check_inc, 1 );
auto *wait2 = new WorkItemFull<bool, int>( check_inc, 2 );
wait1->add_dependency( id0 );
wait1->add_dependency( id1 );
wait2->add_dependency( id1 );
wait2->add_dependency( id2 );
ids.clear();
ids.push_back( tpool.add_work( wait1 ) );
ids.push_back( tpool.add_work( wait2 ) );
tpool.wait_all( ids.size(), &ids[0] );
if ( !tpool.getFunctionRet<bool>( ids[0] ) || !tpool.getFunctionRet<bool>( ids[1] ) )
ut.failure( "Failed to wait on required dependency" );
else
ut.passes( "Dependencies" );
tpool.wait_pool_finished();
// Test waiting on more dependencies than in the thread pool (changing priorities)
ids.clear();
for ( size_t i = 0; i < 20; i++ )
ids.push_back( TPOOL_ADD_WORK( &tpool, sleep_inc2, ( 0.1 ) ) );
auto *wait3 = new WorkItemFull<void, double>( sleep_inc2, 0 );
wait3->add_dependencies( ids );
id = tpool.add_work( wait3, 50 );
tpool.wait( id );
bool pass = true;
for ( auto &id : ids )
pass = pass && id.finished();
ids.clear();
if ( pass )
ut.passes( "Dependencies2" );
else
ut.failure( "Dependencies2" );
// Check that we can handle more complex dependencies
id1 = TPOOL_ADD_WORK( &tpool, sleep_inc2, ( 0.5 ) );
for ( int i = 0; i < 10; i++ ) {
wait1 = new WorkItemFull<bool, int>( check_inc, 1 );
wait1->add_dependency( id1 );
tpool.add_work( wait1 );
}
tpool.wait_pool_finished();
ids.clear();
for ( int i = 0; i < 5; i++ )
ids.push_back( TPOOL_ADD_WORK( &tpool, sleep_inc2, ( 0.5 ) ) );
sleep_inc2( 0.002 );
ThreadPool::WorkItem *work = new WorkItemFull<void, int>( waste_cpu, 100 );
work->add_dependencies( ids );
id = tpool.add_work( work, 10 );
tpool.wait( id );
}
// Test the timing creating and running a work item
barrier();
{
printp( "Testing timings (creating/running work item):\n" );
std::string timer_name = "Create/Run work item";
PROFILE_START( timer_name );
int64_t time_create = 0;
int64_t time_run = 0;
int64_t time_delete = 0;
std::vector<ThreadPool::WorkItem *> work( N_work );
start = std::chrono::high_resolution_clock::now();
for ( int n = 0; n < N_it; n++ ) {
auto t1 = std::chrono::high_resolution_clock::now();
for ( int i = 0; i < N_work; i++ )
work[i] = ThreadPool::createWork<void, int>( waste_cpu, data1[i] );
auto t2 = std::chrono::high_resolution_clock::now();
for ( int i = 0; i < N_work; i++ )
work[i]->run();
auto t3 = std::chrono::high_resolution_clock::now();
for ( int i = 0; i < N_work; i++ )
delete work[i];
auto t4 = std::chrono::high_resolution_clock::now();
time_create += to_ns( t2 - t1 );
time_run += to_ns( t3 - t2 );
time_delete += to_ns( t4 - t3 );
if ( ( n + 1 ) % 100 == 0 )
printp( "Cycle %i of %i finished\n", n + 1, N_it );
}
stop = std::chrono::high_resolution_clock::now();
time = std::chrono::duration<double>( stop - start ).count();
PROFILE_STOP( timer_name );
printp( " time = %0.0f ms\n", 1e3 * time );
printp( " time / cycle = %0.0f us\n", 1e6 * time / N_it );
printp( " average time / item = %0.0f ns\n", 1e9 * time / ( N_it * N_work ) );
printp( "   create = %i ns\n", static_cast<int>( time_create / ( N_it * N_work ) ) );
printp( "   run    = %i ns\n", static_cast<int>( time_run / ( N_it * N_work ) ) );
printp( "   delete = %i ns\n", static_cast<int>( time_delete / ( N_it * N_work ) ) );
}
// Test the timing adding a single item
barrier();
for ( int it = 0; it < 2; it++ ) {
ThreadPool *tpool_ptr = nullptr;
std::string timer_name;
if ( it == 0 ) {
printp( "Testing timings (adding a single item to empty tpool):\n" );
timer_name = "Add single item to empty pool";
tpool_ptr = &tpool0;
} else if ( it == 1 ) {
printp( "Testing timings (adding a single item):\n" );
timer_name = "Add single item to tpool";
tpool_ptr = &tpool;
}
PROFILE_START( timer_name );
std::vector<ThreadPool::thread_id_t> ids( N_work );
int64_t time_add = 0;
int64_t time_wait = 0;
start = std::chrono::high_resolution_clock::now();
for ( int n = 0; n < N_it; n++ ) {
auto t1 = std::chrono::high_resolution_clock::now();
for ( int i = 0; i < N_work; i++ )
ids[i] = TPOOL_ADD_WORK( tpool_ptr, waste_cpu, ( data1[i] ), priority[i] );
auto t2 = std::chrono::high_resolution_clock::now();
tpool_ptr->wait_all( N_work, &ids[0] );
auto t3 = std::chrono::high_resolution_clock::now();
time_add += to_ns( t2 - t1 );
time_wait += to_ns( t3 - t2 );
if ( ( n + 1 ) % 100 == 0 )
printp( "Cycle %i of %i finished\n", n + 1, N_it );
}
stop = std::chrono::high_resolution_clock::now();
time = std::chrono::duration<double>( stop - start ).count();
PROFILE_STOP( timer_name );
printp( " time = %0.0f ms\n", 1e3 * time );
printp( " time / cycle = %0.0f us\n", 1e6 * time / N_it );
printp( " average time / item = %0.0f ns\n", 1e9 * time / ( N_it * N_work ) );
printp( "   create and add = %i ns\n", static_cast<int>( time_add / ( N_it * N_work ) ) );
printp( "   wait = %i ns\n", static_cast<int>( time_wait / ( N_it * N_work ) ) );
}
// Test the timing pre-creating the work items and adding multiple at a time
barrier();
for ( int it = 0; it < 2; it++ ) {
ThreadPool *tpool_ptr = nullptr;
std::string timer_name;
if ( it == 0 ) {
printp( "Testing timings (adding a block of items to empty tpool):\n" );
timer_name = "Add multiple items to empty pool";
tpool_ptr = &tpool0;
} else if ( it == 1 ) {
printp( "Testing timings (adding a block of items):\n" );
timer_name = "Add multiple items to tpool";
tpool_ptr = &tpool;
}
PROFILE_START( timer_name );
int64_t time_create_work = 0;
int64_t time_add_work = 0;
int64_t time_wait_work = 0;
std::vector<ThreadPool::WorkItem *> work( N_work );
start = std::chrono::high_resolution_clock::now();
for ( int n = 0; n < N_it; n++ ) {
auto t1 = std::chrono::high_resolution_clock::now();
for ( int i = 0; i < N_work; i++ )
work[i] = ThreadPool::createWork<void, int>( waste_cpu, data1[i] );
auto t2 = std::chrono::high_resolution_clock::now();
auto ids = tpool_ptr->add_work( work, priority );
auto t3 = std::chrono::high_resolution_clock::now();
tpool_ptr->wait_all( ids );
auto t4 = std::chrono::high_resolution_clock::now();
time_create_work += to_ns( t2 - t1 );
time_add_work += to_ns( t3 - t2 );
time_wait_work += to_ns( t4 - t3 );
if ( ( n + 1 ) % 100 == 0 )
printp( "Cycle %i of %i finished\n", n + 1, N_it );
}
stop = std::chrono::high_resolution_clock::now();
time = std::chrono::duration<double>( stop - start ).count();
PROFILE_STOP( timer_name );
printp( " time = %0.0f ms\n", 1e3 * time );
printp( " time / cycle = %0.0f us\n", 1e6 * time / N_it );
printp( " average time / item = %0.0f ns\n", 1e9 * time / ( N_it * N_work ) );
printp( "   create = %i ns\n", static_cast<int>( time_create_work / ( N_it * N_work ) ) );
printp( "   add    = %i ns\n", static_cast<int>( time_add_work / ( N_it * N_work ) ) );
printp( "   wait   = %i ns\n", static_cast<int>( time_wait_work / ( N_it * N_work ) ) );
}
// Run a dependency test that tests a simple case that should keep the thread pool busy
// Note: Checking the results requires looking at the trace data
tpool.wait_pool_finished();
PROFILE_START( "Dependency test" );
for ( int i = 0; i < 10; i++ ) {
char msg[3][100];
sprintf( msg[0], "Item %i-%i", i, 0 );
sprintf( msg[1], "Item %i-%i", i, 1 );
sprintf( msg[2], "Item %i-%i", i, 2 );
ThreadPool::WorkItem *work =
new WorkItemFull<void, double, std::string>( sleep_msg, 0.5, msg[0] );
ThreadPool::WorkItem *work1 =
new WorkItemFull<void, double, std::string>( sleep_msg, 0.1, msg[1] );
ThreadPool::WorkItem *work2 =
new WorkItemFull<void, double, std::string>( sleep_msg, 0.1, msg[2] );
ThreadPool::thread_id_t id = tpool.add_work( work );
work1->add_dependency( id );
work2->add_dependency( id );
tpool.add_work( work1 );
tpool.add_work( work2 );
}
tpool.wait_pool_finished();
PROFILE_STOP( "Dependency test" );
// Close the thread pool
tpool.setNumThreads( 0 );
// Save the profiling results
PROFILE_SAVE( "test_thread_pool" );
PROFILE_DISABLE();
// Test creating/destroying a thread pool using new
barrier();
pass = true;
try {
ThreadPool *tpool = new ThreadPool( ThreadPool::MAX_NUM_THREADS - 1 );
if ( tpool->getNumThreads() != ThreadPool::MAX_NUM_THREADS - 1 )
pass = false;
if ( !ThreadPool::is_valid( tpool ) )
pass = false;
delete tpool;
// Check that tpool is invalid
// Note: valgrind will report this as an invalid memory read, but we want to keep the test
if ( ThreadPool::is_valid( tpool ) )
pass = false;
} catch ( ... ) {
pass = false;
}
if ( pass )
ut.passes( "Created/destroyed thread pool with new" );
else
ut.failure( "Created/destroyed thread pool with new" );
// Print the test results
barrier();
ut.report();
auto N_errors = static_cast<int>( ut.NumFailGlobal() );
// Shut down MPI
pout << "Shutting down\n";
barrier();
#ifdef USE_TIMER
if ( rank == 0 )
MemoryApp::print( pout );
#endif
#ifdef USE_MPI
MPI_Finalize();
sleep_ms( 10 );
#endif
return N_errors;
}